From 912d545921b69d61bb0a3355177fcda0935bad01 Mon Sep 17 00:00:00 2001 From: Jerome Dockes Date: Wed, 15 Nov 2023 14:05:26 +0100 Subject: [PATCH 01/17] start implementing interpolation join with dataframe api --- skrub/_dataframe/__init__.py | 3 + skrub/_dataframe/_common.py | 9 +++ skrub/_dataframe/_pandas.py | 60 ++++++++++++++++++- skrub/_dataframe/_polars.py | 71 ++++++++++++++++++++++- skrub/_interpolation_joiner.py | 102 ++++++++++++++++++++++----------- skrub/_join_utils.py | 3 +- 6 files changed, 211 insertions(+), 37 deletions(-) create mode 100644 skrub/_dataframe/_common.py diff --git a/skrub/_dataframe/__init__.py b/skrub/_dataframe/__init__.py index e69de29bb..6dec0a176 100644 --- a/skrub/_dataframe/__init__.py +++ b/skrub/_dataframe/__init__.py @@ -0,0 +1,3 @@ +from ._namespace import get_df_namespace + +__all__ = ["get_df_namespace"] diff --git a/skrub/_dataframe/_common.py b/skrub/_dataframe/_common.py new file mode 100644 index 000000000..8d7c6a662 --- /dev/null +++ b/skrub/_dataframe/_common.py @@ -0,0 +1,9 @@ +import enum + + +class Selector(enum.Enum): + ALL = enum.auto() + NONE = enum.auto() + NUMERIC = enum.auto() + CATEGORICAL = enum.auto() + STRING = enum.auto() diff --git a/skrub/_dataframe/_pandas.py b/skrub/_dataframe/_pandas.py index 933e8e5b1..499bdd446 100644 --- a/skrub/_dataframe/_pandas.py +++ b/skrub/_dataframe/_pandas.py @@ -9,6 +9,24 @@ from skrub._utils import atleast_1d_or_none +from ._common import Selector + +__all__ = [ + "make_dataframe", + "make_series", + "aggregate", + "join", + "split_num_categ_cols", + "select", + "drop", + "Selector", + "concatenate", + "any_rowwise", + "collect", + "to_numpy", + "to_pandas", +] + def make_dataframe(X, index=None): """Convert an dictionary of columns into a Pandas dataframe. @@ -329,4 +347,44 @@ def split_num_categ_cols(table): def select(dataframe, columns): - return dataframe[columns] + if not isinstance(columns, Selector): + return dataframe[columns] + if columns is Selector.ALL: + return dataframe + elif columns is Selector.NONE: + return dataframe[[]] + elif columns is Selector.NUMERIC: + return dataframe.select_dtypes("number") + elif columns is Selector.CATEGORICAL: + return dataframe.select_dtypes(["object", "string", "category"]) + elif columns is Selector.STRING: + return dataframe.select_dtypes(["string"]) + # we have covered all items in the enumeration + assert False + + +def drop(dataframe, columns): + return dataframe.drop(select(dataframe, columns).columns.values) + + +def any_rowwise(dataframe): + return dataframe.any(axis=1) + + +def concatenate(dataframe, *other_dataframes): + other_dataframes = [ + df.set_axis(dataframe.index, axis="index") for df in other_dataframes + ] + return pd.concat([dataframe] + list(other_dataframes), axis=1) + + +def collect(dataframe): + return dataframe + + +def to_pandas(dataframe): + return dataframe + + +def to_numpy(dataframe): + return dataframe.to_numpy() diff --git a/skrub/_dataframe/_polars.py b/skrub/_dataframe/_polars.py index 714973ede..d17af0dd4 100644 --- a/skrub/_dataframe/_polars.py +++ b/skrub/_dataframe/_polars.py @@ -13,6 +13,25 @@ from skrub._utils import atleast_1d_or_none +from ._common import Selector + +__all__ = [ + "POLARS_SETUP", + "make_dataframe", + "make_series", + "aggregate", + "join", + "split_num_categ_cols", + "select", + "drop", + "Selector", + "concatenate", + "any_rowwise", + "collect", + "to_numpy", + "to_pandas", +] + def make_dataframe(X, index=None): """Convert an dictionary of columns into a Polars dataframe. 
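
The `Selector` enum gives both backends a shared vocabulary for requesting
column subsets without repeating dtype lists at every call site. A minimal
usage sketch of the pandas implementation above (illustrative frame; `drop`
assumes the `axis=1` fix that lands later in this series):

    import pandas as pd

    from skrub._dataframe._pandas import Selector, drop, select

    df = pd.DataFrame({"id": [1, 2], "name": ["a", "b"], "temp": [0.5, 1.5]})
    select(df, Selector.NUMERIC)      # columns "id" and "temp"
    select(df, Selector.CATEGORICAL)  # column "name"
    drop(df, Selector.NUMERIC)        # keeps only "name"
    select(df, ["name"])              # plain column lists still pass through
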
@@ -263,5 +282,55 @@ def split_num_categ_cols(table): return num_cols, categ_cols +def _check_selector(columns): + if not isinstance(columns, Selector): + return columns + if columns is Selector.ALL: + return cs.all() + elif columns is Selector.NONE: + return [] + elif columns is Selector.NUMERIC: + return cs.numeric() + elif columns is Selector.CATEGORICAL: + return cs.string(include_categorical=True) + elif columns is Selector.STRING: + return cs.string() + # we have covered all items in the enumeration + assert False + + def select(dataframe, columns): - return dataframe.select(columns) + return dataframe.select(_check_selector(columns)) + + +def drop(dataframe, columns): + return dataframe.drop(_check_selector(columns)) + + +def any_rowwise(dataframe): + return collect(dataframe.select(pl.any_horizontal(pl.all()))).get_column("any") + + +def concatenate(dataframe, *other_dataframes): + return pl.concat( + [collect(dataframe)] + [collect(df) for df in other_dataframes], + how="horizontal", + ) + + +def collect(dataframe): + if hasattr(dataframe, "collect"): + dataframe = dataframe.collect() + return dataframe + + +def to_pandas(dataframe): + if hasattr(dataframe, "collect"): + dataframe = dataframe.collect() + return dataframe.to_pandas() + + +def to_numpy(dataframe): + if hasattr(dataframe, "to_numpy"): + return dataframe.to_numpy() + return dataframe.collect().to_numpy() diff --git a/skrub/_interpolation_joiner.py b/skrub/_interpolation_joiner.py index 92e9bf30b..8d5862f3a 100644 --- a/skrub/_interpolation_joiner.py +++ b/skrub/_interpolation_joiner.py @@ -2,7 +2,6 @@ import joblib import numpy as np -import pandas as pd from sklearn.base import BaseEstimator, TransformerMixin, clone from sklearn.ensemble import ( HistGradientBoostingClassifier, @@ -11,6 +10,7 @@ from sklearn.utils._tags import _safe_tags from skrub import _join_utils, _utils +from skrub._dataframe import get_df_namespace from skrub._minhash_encoder import MinHashEncoder from skrub._table_vectorizer import TableVectorizer @@ -221,12 +221,16 @@ def fit(self, X, y=None): self._check_inputs() if X is not None: _join_utils.check_missing_columns(X, self._main_key, "'X' (the main table)") - key_values = self.vectorizer_.fit_transform(self.aux_table[self._aux_key]) + aux_table = self.aux_table.__dataframe_consortium_standard__() + # TODO avoid conversion to pandas when TableVectorizer supports it + ns, _ = get_df_namespace(self.aux_table) + df = ns.to_pandas(aux_table.select(*self._aux_key).dataframe) + key_values = self.vectorizer_.fit_transform(df) estimators = self._get_estimator_assignments() fit_results = joblib.Parallel(self.n_jobs)( joblib.delayed(_fit)( key_values, - self.aux_table[assignment["columns"]], + aux_table.select(*assignment["columns"]).dataframe, assignment["estimator"], propagate_exceptions=(self.on_estimator_failure == "raise"), ) @@ -281,45 +285,52 @@ def transform(self, X): The result of the join between `X` and inferred rows from ``self.aux_table``. 
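
On the polars side, `_check_selector` above maps each `Selector` member to a
`polars.selectors` expression; roughly the following equivalence (a sketch,
assuming polars is installed):

    import polars as pl
    import polars.selectors as cs

    df = pl.DataFrame({"id": [1, 2], "name": ["a", "b"], "temp": [0.5, 1.5]})
    df.select(cs.numeric())                         # Selector.NUMERIC
    df.select(cs.string(include_categorical=True))  # Selector.CATEGORICAL
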
""" - main_table = X + main_table = X.__dataframe_consortium_standard__() _join_utils.check_missing_columns( - main_table, self._main_key, "'X' (the main table)" + main_table.dataframe, self._main_key, "'X' (the main table)" ) - key_values = self.vectorizer_.transform( - main_table[self._main_key].set_axis(self._aux_key, axis="columns") + df = ( + main_table.select(*self._main_key) + .rename_columns(dict(zip(main_table.column_names, self._aux_key))) + .dataframe ) + # TODO avoid conversion to pandas when vectorizer supports it + ns, _ = get_df_namespace(df) + df = ns.to_pandas(df) + key_values = self.vectorizer_.transform(df) prediction_results = joblib.Parallel(self.n_jobs)( joblib.delayed(_predict)( key_values, assignment["columns"], assignment["estimator"], propagate_exceptions=(self.on_estimator_failure == "raise"), + ns=get_df_namespace(self.aux_table)[0], ) for assignment in self.estimators_ ) prediction_results = self._check_prediction_results(prediction_results) predictions = [res["predictions"] for res in prediction_results] predictions = _add_column_name_suffix(predictions, self.suffix) - for part in predictions: - part.index = main_table.index - return pd.concat([main_table] + predictions, axis=1) + ns, _ = get_df_namespace(self.aux_table) + return ns.concatenate(main_table.dataframe, *predictions) def _check_prediction_results(self, results): checked_results = [] failed_columns = [] + ns, _ = get_df_namespace(self.aux_table) for res in results: new_res = dict(**res) if res["failed"]: - if set(res["columns"]).issubset( - self.aux_table.select_dtypes("number").columns.values - ): - dtype = float - else: - dtype = object - pred = pd.DataFrame( - columns=res["columns"], - index=np.arange(res["shape"][0]), - dtype=dtype, + # TODO + # numeric_cols = list( + # ns.select(self.aux_table, ns.Selector.NUMERIC).columns + # ) + # if set(res["columns"]).issubset(numeric_cols): + # dtype = float + # else: + # dtype = object + pred = ns.make_dataframe( + {c: [None for _ in range(res["shape"][0])] for c in res["columns"]}, ) new_res["predictions"] = pred failed_columns.extend(res["columns"]) @@ -351,13 +362,16 @@ def _get_estimator_assignments(self): When the estimator does not handle multi-output, an estimator is fitted separately to each column. """ - aux_table = self.aux_table.drop(self._aux_key, axis=1) + aux_table = self.aux_table.__dataframe_consortium_standard__().drop_columns( + *self._aux_key + ) + ns, _ = get_df_namespace(aux_table.dataframe) assignments = [] - regression_table = aux_table.select_dtypes("number") + regression_table = ns.select(aux_table.dataframe, ns.Selector.NUMERIC) assignments.extend( _get_assignments_for_estimator(regression_table, self.regressor_) ) - classification_table = aux_table.select_dtypes(["object", "string", "category"]) + classification_table = ns.select(aux_table.dataframe, ns.Selector.CATEGORICAL) assignments.extend( _get_assignments_for_estimator(classification_table, self.classifier_) ) @@ -373,15 +387,19 @@ def _get_assignments_for_estimator(table, estimator): # estimator is empty (eg the estimator is the regressor and there are no # numerical columns), return an empty list -- no columns are assigned to # that estimator. 
- if table.empty: + table = table.__dataframe_consortium_standard__() + if not len(table.column_names): return [] if not _handles_multioutput(estimator): - return [{"columns": [col], "estimator": estimator} for col in table.columns] - columns_with_nulls = table.columns[table.isnull().any()] + return [ + {"columns": [col], "estimator": estimator} for col in table.column_names + ] + table = table.persist() + columns_with_nulls = [c for c in table.column_names if table.col(c).is_null().any()] assignments = [ {"columns": [col], "estimator": estimator} for col in columns_with_nulls ] - columns_without_nulls = list(set(table.columns).difference(columns_with_nulls)) + columns_without_nulls = list(set(table.column_names).difference(columns_with_nulls)) if columns_without_nulls: assignments.append({"columns": columns_without_nulls, "estimator": estimator}) return assignments @@ -392,14 +410,21 @@ def _handles_multioutput(estimator): def _fit(key_values, target_table, estimator, propagate_exceptions): + target_table = target_table.__dataframe_consortium_standard__() estimator = clone(estimator) - kept_rows = target_table.notnull().all(axis=1).to_numpy() + ns, _ = get_df_namespace(target_table.dataframe) + kept_rows = ~( + ns.any_rowwise(target_table.is_null().dataframe) + .__column_consortium_standard__() + .to_array() + ) key_values = key_values[kept_rows] - Y = target_table.to_numpy()[kept_rows] + target_table = target_table.persist() + Y = target_table.to_array(None)[kept_rows] # Estimators that expect a single output issue a DataConversionWarning if # passing a column vector rather than a 1-D array - if len(target_table.columns) == 1: + if len(target_table.column_names) == 1: Y = Y.ravel() failed = False try: @@ -409,10 +434,14 @@ def _fit(key_values, target_table, estimator, propagate_exceptions): raise failed = True estimator = None - return {"columns": target_table.columns, "estimator": estimator, "failed": failed} + return { + "columns": target_table.column_names, + "estimator": estimator, + "failed": failed, + } -def _predict(key_values, columns, estimator, propagate_exceptions): +def _predict(key_values, columns, estimator, propagate_exceptions, ns): failed = False try: Y_values = estimator.predict(key_values) @@ -423,7 +452,9 @@ def _predict(key_values, columns, estimator, propagate_exceptions): if failed: predictions = None else: - predictions = pd.DataFrame(data=Y_values, columns=columns) + predictions = ns.make_dataframe( + dict(zip(columns, np.atleast_2d(Y_values.T).T.T)) + ) return { "predictions": predictions, "failed": failed, @@ -437,5 +468,8 @@ def _add_column_name_suffix(dataframes, suffix): return dataframes renamed = [] for df in dataframes: - renamed.append(df.rename(columns={c: f"{c}{suffix}" for c in df.columns})) + df = df.__dataframe_consortium_standard__() + renamed.append( + df.rename_columns({c: f"{c}{suffix}" for c in df.column_names}).dataframe + ) return renamed diff --git a/skrub/_join_utils.py b/skrub/_join_utils.py index 7d4dd3432..6a92b5964 100644 --- a/skrub/_join_utils.py +++ b/skrub/_join_utils.py @@ -61,7 +61,8 @@ def check_missing_columns(table, key, table_name): table_name : str Name by which to refer to `table` in the error message if necessary. 
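
Whether several target columns can share one estimator is decided by the
scikit-learn `multioutput` estimator tag, presumably read through the
`_safe_tags` import kept at the top of `_interpolation_joiner.py`. A sketch
of the check (hedged: `_handles_multioutput`'s body is not shown in this
diff):

    from sklearn.ensemble import HistGradientBoostingRegressor
    from sklearn.utils._tags import _safe_tags

    # False for HistGradientBoostingRegressor, the default regressor here,
    # so each numeric column would get its own cloned estimator
    _safe_tags(HistGradientBoostingRegressor(), key="multioutput")
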
""" - missing_columns = set(key) - set(table.columns) + table = table.__dataframe_consortium_standard__() + missing_columns = set(key) - set(table.column_names) if not missing_columns: return raise ValueError( From cd3e9423a2cf8b7aab78b0f7c78d18fd6271e23e Mon Sep 17 00:00:00 2001 From: Jerome Dockes Date: Wed, 15 Nov 2023 14:26:20 +0100 Subject: [PATCH 02/17] store schema rather than columns --- skrub/_interpolation_joiner.py | 43 +++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/skrub/_interpolation_joiner.py b/skrub/_interpolation_joiner.py index 8d5862f3a..1aa986b06 100644 --- a/skrub/_interpolation_joiner.py +++ b/skrub/_interpolation_joiner.py @@ -125,8 +125,8 @@ class InterpolationJoiner(TransformerMixin, BaseEstimator): estimators_ : list of dicts The estimators used to infer values to be joined. Each entry in this list is a dictionary with keys ``"estimator"`` (the fitted estimator) - and ``"columns"`` (the list of columns in ``aux_table`` that it is - trained to predict). + and ``"schema"`` (the names and data types of columns in ``aux_table`` + that it is trained to predict). See Also -------- @@ -230,7 +230,7 @@ def fit(self, X, y=None): fit_results = joblib.Parallel(self.n_jobs)( joblib.delayed(_fit)( key_values, - aux_table.select(*assignment["columns"]).dataframe, + aux_table.select(*assignment["schema"].keys()).dataframe, assignment["estimator"], propagate_exceptions=(self.on_estimator_failure == "raise"), ) @@ -258,7 +258,7 @@ def _check_fit_results(self, results): failed_columns = [] for res in results: if res["failed"]: - failed_columns.extend(res["columns"]) + failed_columns.extend(res["schema"].keys()) if not failed_columns: return successful_results warnings.warn( @@ -301,7 +301,7 @@ def transform(self, X): prediction_results = joblib.Parallel(self.n_jobs)( joblib.delayed(_predict)( key_values, - assignment["columns"], + assignment["schema"], assignment["estimator"], propagate_exceptions=(self.on_estimator_failure == "raise"), ns=get_df_namespace(self.aux_table)[0], @@ -330,10 +330,13 @@ def _check_prediction_results(self, results): # else: # dtype = object pred = ns.make_dataframe( - {c: [None for _ in range(res["shape"][0])] for c in res["columns"]}, + { + c: [None for _ in range(res["shape"][0])] + for c in res["schema"].keys() + }, ) new_res["predictions"] = pred - failed_columns.extend(res["columns"]) + failed_columns.extend(res["schema"].keys()) checked_results.append(new_res) if not failed_columns: return checked_results @@ -350,7 +353,7 @@ def _get_estimator_assignments(self): In many cases, a single estimator cannot handle all the target columns. This function groups columns that can be handled together and returns a - list of dictionaries, each with keys "columns" and "estimator". + list of dictionaries, each with keys "schema" and "estimator". Regression and classification targets are always handled separately. 
@@ -392,16 +395,23 @@ def _get_assignments_for_estimator(table, estimator): return [] if not _handles_multioutput(estimator): return [ - {"columns": [col], "estimator": estimator} for col in table.column_names + {"schema": {col: table.schema[col]}, "estimator": estimator} + for col in table.column_names ] table = table.persist() columns_with_nulls = [c for c in table.column_names if table.col(c).is_null().any()] assignments = [ - {"columns": [col], "estimator": estimator} for col in columns_with_nulls + {"schema": {col: table.schema[col]}, "estimator": estimator} + for col in columns_with_nulls ] columns_without_nulls = list(set(table.column_names).difference(columns_with_nulls)) if columns_without_nulls: - assignments.append({"columns": columns_without_nulls, "estimator": estimator}) + assignments.append( + { + "schema": {c: table.schema[c] for c in columns_without_nulls}, + "estimator": estimator, + } + ) return assignments @@ -435,13 +445,13 @@ def _fit(key_values, target_table, estimator, propagate_exceptions): failed = True estimator = None return { - "columns": target_table.column_names, + "schema": target_table.schema, "estimator": estimator, "failed": failed, } -def _predict(key_values, columns, estimator, propagate_exceptions, ns): +def _predict(key_values, schema, estimator, propagate_exceptions, ns): failed = False try: Y_values = estimator.predict(key_values) @@ -452,14 +462,15 @@ def _predict(key_values, columns, estimator, propagate_exceptions, ns): if failed: predictions = None else: + # TODO use schema predictions = ns.make_dataframe( - dict(zip(columns, np.atleast_2d(Y_values.T).T.T)) + dict(zip(schema.keys(), np.atleast_2d(Y_values.T).T.T)) ) return { "predictions": predictions, "failed": failed, - "columns": columns, - "shape": (key_values.shape[0], len(columns)), + "schema": schema, + "shape": (key_values.shape[0], len(schema)), } From 3c120b6b1c4f95febd5d323d8590e64aec18d7aa Mon Sep 17 00:00:00 2001 From: Jerome Dockes Date: Wed, 15 Nov 2023 15:13:30 +0100 Subject: [PATCH 03/17] failed results with right schema --- skrub/_dataframe/__init__.py | 5 ++- skrub/_dataframe/_common.py | 11 +++++ skrub/_dataframe/_namespace.py | 5 +++ skrub/_interpolation_joiner.py | 73 +++++++++++++++------------------- 4 files changed, 50 insertions(+), 44 deletions(-) diff --git a/skrub/_dataframe/__init__.py b/skrub/_dataframe/__init__.py index 6dec0a176..3ad5b14d7 100644 --- a/skrub/_dataframe/__init__.py +++ b/skrub/_dataframe/__init__.py @@ -1,3 +1,4 @@ -from ._namespace import get_df_namespace +from ._common import std, stdns +from ._namespace import get_df_namespace, skrubns -__all__ = ["get_df_namespace"] +__all__ = ["get_df_namespace", "skrubns", "std", "stdns"] diff --git a/skrub/_dataframe/_common.py b/skrub/_dataframe/_common.py index 8d7c6a662..814592238 100644 --- a/skrub/_dataframe/_common.py +++ b/skrub/_dataframe/_common.py @@ -7,3 +7,14 @@ class Selector(enum.Enum): NUMERIC = enum.auto() CATEGORICAL = enum.auto() STRING = enum.auto() + + +def std(obj): + try: + return obj.__dataframe_consortium_standard__() + except AttributeError: + return obj.__column_consortium_standard__() + + +def stdns(dataframe): + return dataframe.__dataframe_consortium_standard__().__dataframe_namespace__() diff --git a/skrub/_dataframe/_namespace.py b/skrub/_dataframe/_namespace.py index 06c65a2ea..b6c3f6972 100644 --- a/skrub/_dataframe/_namespace.py +++ b/skrub/_dataframe/_namespace.py @@ -97,3 +97,8 @@ def get_df_namespace(*dfs): "Only Pandas or Polars dataframes are currently supported, " 
f"got {modules=!r}." ) + + +def skrubns(*dataframes): + ns, _ = get_df_namespace(*dataframes) + return ns diff --git a/skrub/_interpolation_joiner.py b/skrub/_interpolation_joiner.py index 1aa986b06..04aff393d 100644 --- a/skrub/_interpolation_joiner.py +++ b/skrub/_interpolation_joiner.py @@ -1,7 +1,7 @@ +import itertools import warnings import joblib -import numpy as np from sklearn.base import BaseEstimator, TransformerMixin, clone from sklearn.ensemble import ( HistGradientBoostingClassifier, @@ -10,7 +10,7 @@ from sklearn.utils._tags import _safe_tags from skrub import _join_utils, _utils -from skrub._dataframe import get_df_namespace +from skrub._dataframe import skrubns, std, stdns from skrub._minhash_encoder import MinHashEncoder from skrub._table_vectorizer import TableVectorizer @@ -221,9 +221,9 @@ def fit(self, X, y=None): self._check_inputs() if X is not None: _join_utils.check_missing_columns(X, self._main_key, "'X' (the main table)") - aux_table = self.aux_table.__dataframe_consortium_standard__() + aux_table = std(self.aux_table) # TODO avoid conversion to pandas when TableVectorizer supports it - ns, _ = get_df_namespace(self.aux_table) + ns = skrubns(self.aux_table) df = ns.to_pandas(aux_table.select(*self._aux_key).dataframe) key_values = self.vectorizer_.fit_transform(df) estimators = self._get_estimator_assignments() @@ -285,7 +285,7 @@ def transform(self, X): The result of the join between `X` and inferred rows from ``self.aux_table``. """ - main_table = X.__dataframe_consortium_standard__() + main_table = std(X) _join_utils.check_missing_columns( main_table.dataframe, self._main_key, "'X' (the main table)" ) @@ -295,8 +295,7 @@ def transform(self, X): .dataframe ) # TODO avoid conversion to pandas when vectorizer supports it - ns, _ = get_df_namespace(df) - df = ns.to_pandas(df) + df = skrubns(df).to_pandas(df) key_values = self.vectorizer_.transform(df) prediction_results = joblib.Parallel(self.n_jobs)( joblib.delayed(_predict)( @@ -304,36 +303,29 @@ def transform(self, X): assignment["schema"], assignment["estimator"], propagate_exceptions=(self.on_estimator_failure == "raise"), - ns=get_df_namespace(self.aux_table)[0], + api_ns=stdns(self.aux_table), ) for assignment in self.estimators_ ) prediction_results = self._check_prediction_results(prediction_results) predictions = [res["predictions"] for res in prediction_results] predictions = _add_column_name_suffix(predictions, self.suffix) - ns, _ = get_df_namespace(self.aux_table) - return ns.concatenate(main_table.dataframe, *predictions) + return skrubns(self.aux_table).concatenate(main_table.dataframe, *predictions) def _check_prediction_results(self, results): checked_results = [] failed_columns = [] - ns, _ = get_df_namespace(self.aux_table) + api_ns = stdns(self.aux_table) for res in results: new_res = dict(**res) if res["failed"]: - # TODO - # numeric_cols = list( - # ns.select(self.aux_table, ns.Selector.NUMERIC).columns - # ) - # if set(res["columns"]).issubset(numeric_cols): - # dtype = float - # else: - # dtype = object - pred = ns.make_dataframe( - { - c: [None for _ in range(res["shape"][0])] - for c in res["schema"].keys() - }, + pred = api_ns.dataframe_from_columns( + *[ + api_ns.column_from_sequence( + itertools.repeat(None, res["shape"][0]), name=c, dtype=dt + ) + for c, dt in res["schema"].items() + ], ) new_res["predictions"] = pred failed_columns.extend(res["schema"].keys()) @@ -365,10 +357,8 @@ def _get_estimator_assignments(self): When the estimator does not handle multi-output, an estimator is 
fitted separately to each column. """ - aux_table = self.aux_table.__dataframe_consortium_standard__().drop_columns( - *self._aux_key - ) - ns, _ = get_df_namespace(aux_table.dataframe) + aux_table = std(self.aux_table).drop_columns(*self._aux_key) + ns = skrubns(aux_table.dataframe) assignments = [] regression_table = ns.select(aux_table.dataframe, ns.Selector.NUMERIC) assignments.extend( @@ -390,7 +380,7 @@ def _get_assignments_for_estimator(table, estimator): # estimator is empty (eg the estimator is the regressor and there are no # numerical columns), return an empty list -- no columns are assigned to # that estimator. - table = table.__dataframe_consortium_standard__() + table = std(table) if not len(table.column_names): return [] if not _handles_multioutput(estimator): @@ -420,14 +410,10 @@ def _handles_multioutput(estimator): def _fit(key_values, target_table, estimator, propagate_exceptions): - target_table = target_table.__dataframe_consortium_standard__() + target_table = std(target_table) estimator = clone(estimator) - ns, _ = get_df_namespace(target_table.dataframe) - kept_rows = ~( - ns.any_rowwise(target_table.is_null().dataframe) - .__column_consortium_standard__() - .to_array() - ) + ns = skrubns(target_table.dataframe) + kept_rows = ~(std(ns.any_rowwise(target_table.is_null().dataframe)).to_array()) key_values = key_values[kept_rows] target_table = target_table.persist() Y = target_table.to_array(None)[kept_rows] @@ -451,7 +437,7 @@ def _fit(key_values, target_table, estimator, propagate_exceptions): } -def _predict(key_values, schema, estimator, propagate_exceptions, ns): +def _predict(key_values, schema, estimator, propagate_exceptions, api_ns): failed = False try: Y_values = estimator.predict(key_values) @@ -462,10 +448,13 @@ def _predict(key_values, schema, estimator, propagate_exceptions, ns): if failed: predictions = None else: - # TODO use schema - predictions = ns.make_dataframe( - dict(zip(schema.keys(), np.atleast_2d(Y_values.T).T.T)) - ) + if Y_values.ndim == 1: + Y_values = Y_values[:, None] + cols = [ + api_ns.column_from_1d_array(y.astype(type(y[0])), name=c, dtype=dt) + for y, (c, dt) in zip(Y_values.T, schema.items()) + ] + predictions = api_ns.dataframe_from_columns(*cols).dataframe return { "predictions": predictions, "failed": failed, @@ -479,7 +468,7 @@ def _add_column_name_suffix(dataframes, suffix): return dataframes renamed = [] for df in dataframes: - df = df.__dataframe_consortium_standard__() + df = std(df) renamed.append( df.rename_columns({c: f"{c}{suffix}" for c in df.column_names}).dataframe ) From 9e5630564889da8dc9d98bfb5c29a0814e112734 Mon Sep 17 00:00:00 2001 From: Jerome Dockes Date: Wed, 15 Nov 2023 15:23:02 +0100 Subject: [PATCH 04/17] fix preserving pandas index --- skrub/_interpolation_joiner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/skrub/_interpolation_joiner.py b/skrub/_interpolation_joiner.py index 04aff393d..d02f6900e 100644 --- a/skrub/_interpolation_joiner.py +++ b/skrub/_interpolation_joiner.py @@ -310,7 +310,7 @@ def transform(self, X): prediction_results = self._check_prediction_results(prediction_results) predictions = [res["predictions"] for res in prediction_results] predictions = _add_column_name_suffix(predictions, self.suffix) - return skrubns(self.aux_table).concatenate(main_table.dataframe, *predictions) + return skrubns(self.aux_table).concatenate(X, *predictions) def _check_prediction_results(self, results): checked_results = [] @@ -326,7 +326,7 @@ def _check_prediction_results(self, 
results): ) for c, dt in res["schema"].items() ], - ) + ).dataframe new_res["predictions"] = pred failed_columns.extend(res["schema"].keys()) checked_results.append(new_res) From b0b7c8990b1b95f74b0b619f2d06b37ef416a382 Mon Sep 17 00:00:00 2001 From: Jerome Dockes Date: Wed, 15 Nov 2023 15:44:42 +0100 Subject: [PATCH 05/17] update tests --- skrub/tests/test_interpolation_join.py | 93 ++++++++++++++------------ skrub/tests/test_select_cols.py | 13 +--- 2 files changed, 54 insertions(+), 52 deletions(-) diff --git a/skrub/tests/test_interpolation_join.py b/skrub/tests/test_interpolation_join.py index feee706c8..a73211d95 100644 --- a/skrub/tests/test_interpolation_join.py +++ b/skrub/tests/test_interpolation_join.py @@ -9,15 +9,15 @@ @pytest.fixture -def buildings(): - return pd.DataFrame( +def buildings(px): + return px.DataFrame( {"latitude": [1.0, 2.0], "longitude": [1.0, 2.0], "n_stories": [3, 7]} ) @pytest.fixture -def weather(): - return pd.DataFrame( +def weather(px): + return px.DataFrame( { "latitude": [1.2, 0.9, 1.9, 1.7, 5.0, 5.0], "longitude": [0.8, 1.1, 1.8, 1.8, 5.0, 5.0], @@ -31,20 +31,21 @@ def weather(): @pytest.mark.parametrize("with_nulls", [False, True]) def test_interpolation_join(buildings, weather, key, with_nulls): if not with_nulls: - weather = weather.fillna(0.0) + weather = weather.__dataframe_consortium_standard__().fill_null(0.0).dataframe transformed = InterpolationJoiner( weather, key=key, regressor=KNeighborsRegressor(2), classifier=KNeighborsClassifier(2), + suffix="_w", ).fit_transform(buildings) - assert_array_equal(transformed["avg_temp"].values, [10.5, 15.5]) - assert_array_equal(transformed["climate"].values, ["A", "B"]) + assert_array_equal(transformed["avg_temp_w"].to_numpy(), [10.5, 15.5]) + assert_array_equal(transformed["climate_w"].to_numpy(), ["A", "B"]) -def test_vectorizer(): - main = pd.DataFrame({"A": [0, 1]}) - aux = pd.DataFrame({"A": [11, 110], "B": [1, 0]}) +def test_vectorizer(px): + main = px.DataFrame({"A": [0, 1]}) + aux = px.DataFrame({"A": [11, 110], "B": [1, 0]}) class Vectorizer(TransformerMixin, BaseEstimator): def fit(self, X): @@ -71,18 +72,18 @@ def test_no_multioutput(buildings, weather): assert transformed.shape == (2, 5) -def test_condition_choice(): - main = pd.DataFrame({"A": [0, 1, 2]}) - aux = pd.DataFrame({"A": [0, 1, 2], "rB": [2, 0, 1], "C": [10, 11, 12]}) +def test_condition_choice(px): + main = px.DataFrame({"A": [0, 1, 2]}) + aux = px.DataFrame({"A": [0, 1, 2], "rB": [2, 0, 1], "C": [10, 11, 12]}) join = InterpolationJoiner( aux, key="A", regressor=KNeighborsRegressor(1) ).fit_transform(main) - assert_array_equal(join["C"].values, [10, 11, 12]) + assert_array_equal(join["C"].to_numpy(), [10, 11, 12]) join = InterpolationJoiner( - aux, main_key="A", aux_key="rB", regressor=KNeighborsRegressor(1) + aux, main_key="A", aux_key="rB", regressor=KNeighborsRegressor(1), suffix="_r" ).fit_transform(main) - assert_array_equal(join["C"].values, [11, 12, 10]) + assert_array_equal(join["C_r"].to_numpy(), [11, 12, 10]) with pytest.raises(ValueError, match="Must pass EITHER"): join = InterpolationJoiner( @@ -100,8 +101,8 @@ def test_condition_choice(): ).fit(None) -def test_suffix(): - df = pd.DataFrame({"A": [0, 1], "B": [0, 1]}) +def test_suffix(px): + df = px.DataFrame({"A": [0, 1], "B": [0, 1]}) join = InterpolationJoiner( df, key="A", suffix="_aux", regressor=KNeighborsRegressor(1) ).fit_transform(df) @@ -114,25 +115,28 @@ def test_mismatched_indexes(): join = InterpolationJoiner( aux, key="A", 
regressor=KNeighborsRegressor(1) ).fit_transform(main) - assert_array_equal(join["B"].values, [10, 11]) - assert_array_equal(join.index.values, [1, 0]) + assert_array_equal(join["B"].to_numpy(), [10, 11]) + assert_array_equal(join.index.to_numpy(), [1, 0]) -def test_fit_on_none(): +def test_fit_on_none(px): # X is hardly used in fit so it should be ok to fit without a main table - aux = pd.DataFrame({"A": [0, 1], "B": [10, 11]}) + aux = px.DataFrame({"A": [0, 1], "B": [10, 11]}) joiner = InterpolationJoiner(aux, key="A", regressor=KNeighborsRegressor(1)).fit( None ) - main = pd.DataFrame({"A": [0, 1]}, index=[1, 0]) + main = px.DataFrame({"A": [0, 1]}) + if isinstance(main, pd.DataFrame): + main.index = [1, 0] join = joiner.transform(main) - assert_array_equal(join["B"].values, [10, 11]) - assert_array_equal(join.index.values, [1, 0]) + assert_array_equal(join["B"].to_numpy(), [10, 11]) + if isinstance(main, pd.DataFrame): + assert_array_equal(join.index.to_numpy(), [1, 0]) -def test_join_on_date(): - sales = pd.DataFrame({"date": ["2023-09-20", "2023-09-29"], "n": [10, 15]}) - temp = pd.DataFrame( +def test_join_on_date(px): + sales = px.DataFrame({"date": ["2023-09-20", "2023-09-29"], "n": [10, 15]}) + temp = px.DataFrame( {"date": ["2023-09-09", "2023-10-01", "2024-09-21"], "temp": [-10, 10, 30]} ) transformed = ( @@ -145,7 +149,7 @@ def test_join_on_date(): .set_params(vectorizer__datetime_transformer__resolution=None) .fit_transform(sales) ) - assert_array_equal(transformed["temp"].values, [-10, 10]) + assert_array_equal(transformed["temp"].to_numpy(), [-10, 10]) class FailFit(DummyClassifier): @@ -154,7 +158,12 @@ def fit(self, X, y): def test_fit_failures(buildings, weather): - weather["climate"] = "A" + try: + weather["climate"] = "A" + except TypeError: + import polars as pl + + weather = weather.with_columns(pl.lit("A").alias("climate")) joiner = InterpolationJoiner( weather, key=["latitude", "longitude"], @@ -163,7 +172,7 @@ def test_fit_failures(buildings, weather): on_estimator_failure="pass", ) join = joiner.fit_transform(buildings) - assert_array_equal(join["avg_temp"].values, [10.5, 15.5]) + assert_array_equal(join["avg_temp"].to_numpy(), [10.5, 15.5]) assert join.shape == (2, 4) joiner = InterpolationJoiner( @@ -175,7 +184,7 @@ def test_fit_failures(buildings, weather): ) with pytest.warns(UserWarning, match="(?s)Estimators failed.*climate"): join = joiner.fit_transform(buildings) - assert_array_equal(join["avg_temp"].values, [10.5, 15.5]) + assert_array_equal(join["avg_temp"].to_numpy(), [10.5, 15.5]) assert join.shape == (2, 4) joiner = InterpolationJoiner( @@ -203,9 +212,9 @@ def test_transform_failures(buildings, weather): on_estimator_failure="pass", ) join = joiner.fit_transform(buildings) - assert_array_equal(join["avg_temp"].values, [10.5, 15.5]) - assert join["climate"].isnull().all() - assert join["climate"].dtype == object + assert_array_equal(join["avg_temp"].to_numpy(), [10.5, 15.5]) + assert join["climate"].__column_consortium_standard__().is_null().all() + assert join["climate"].dtype == weather["climate"].dtype assert join.shape == (2, 5) joiner = InterpolationJoiner( @@ -217,9 +226,9 @@ def test_transform_failures(buildings, weather): ) with pytest.warns(UserWarning, match="(?s)Prediction failed.*climate"): join = joiner.fit_transform(buildings) - assert_array_equal(join["avg_temp"].values, [10.5, 15.5]) - assert join["climate"].isnull().all() - assert join["climate"].dtype == object + assert_array_equal(join["avg_temp"].to_numpy(), [10.5, 15.5]) + assert 
join["climate"].__column_consortium_standard__().is_null().all() + assert join["climate"].dtype == weather["climate"].dtype assert join.shape == (2, 5) joiner = InterpolationJoiner( @@ -242,8 +251,8 @@ def test_transform_failures_dtype(buildings, weather): on_estimator_failure="pass", ) join = joiner.fit_transform(buildings) - assert join["avg_temp"].isnull().all() - assert join["avg_temp"].dtype == "float64" + assert join["avg_temp"].__column_consortium_standard__().is_null().all() + assert join["avg_temp"].dtype == join["avg_temp"].dtype assert join.shape == (2, 5) joiner = InterpolationJoiner( @@ -254,6 +263,6 @@ def test_transform_failures_dtype(buildings, weather): on_estimator_failure="pass", ) join = joiner.fit_transform(buildings) - assert join["climate"].isnull().all() - assert join["climate"].dtype == object + assert join["climate"].__column_consortium_standard__().is_null().all() + assert join["climate"].dtype == weather["climate"].dtype assert join.shape == (2, 5) diff --git a/skrub/tests/test_select_cols.py b/skrub/tests/test_select_cols.py index 3ab07e590..4b08a255c 100644 --- a/skrub/tests/test_select_cols.py +++ b/skrub/tests/test_select_cols.py @@ -3,18 +3,11 @@ import pytest from skrub import DropCols, SelectCols -from skrub._dataframe._polars import POLARS_SETUP -DATAFRAME_MODULES = [pandas] -if POLARS_SETUP: - import polars - DATAFRAME_MODULES.append(polars) - - -@pytest.fixture(params=DATAFRAME_MODULES) -def df(request): - return request.param.DataFrame({"A": [1, 2], "B": [10, 20], "C": ["x", "y"]}) +@pytest.fixture +def df(px): + return px.DataFrame({"A": [1, 2], "B": [10, 20], "C": ["x", "y"]}) def test_select_cols(df): From adda6e45b54a075ad58adeec8a612d30a0ee0d10 Mon Sep 17 00:00:00 2001 From: Jerome Dockes Date: Wed, 15 Nov 2023 15:45:36 +0100 Subject: [PATCH 06/17] add conftest --- skrub/tests/conftest.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 skrub/tests/conftest.py diff --git a/skrub/tests/conftest.py b/skrub/tests/conftest.py new file mode 100644 index 000000000..d142f8b9c --- /dev/null +++ b/skrub/tests/conftest.py @@ -0,0 +1,15 @@ +import pandas +import pytest + +DATAFRAME_MODULES = [pandas] +try: + import polars + + DATAFRAME_MODULES.append(polars) +except ImportError: + pass + + +@pytest.fixture(params=DATAFRAME_MODULES) +def px(request): + return request.param From 668b8490a3aa332dc47ee3488b9145aefb2966fc Mon Sep 17 00:00:00 2001 From: Jerome Dockes Date: Wed, 15 Nov 2023 15:53:05 +0100 Subject: [PATCH 07/17] add dataframe_api_compat --- setup.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.cfg b/setup.cfg index 4abe5b61e..099531f2b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -30,6 +30,7 @@ install_requires = scipy>=1.9.3 pandas>=1.5.3 packaging>=23.1 + dataframe_api_compat>=0.1.28 python_requires = >=3.10 [options.extras_require] From 6729905bb8c9651708f34cff42278c5cbe5263c1 Mon Sep 17 00:00:00 2001 From: Jerome Dockes Date: Wed, 15 Nov 2023 16:09:41 +0100 Subject: [PATCH 08/17] update doctests --- setup.cfg | 2 +- skrub/_interpolation_joiner.py | 1 + skrub/_joiner.py | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 099531f2b..f128297c5 100644 --- a/setup.cfg +++ b/setup.cfg @@ -30,7 +30,7 @@ install_requires = scipy>=1.9.3 pandas>=1.5.3 packaging>=23.1 - dataframe_api_compat>=0.1.28 + dataframe-api-compat>=0.1.28 python_requires = >=3.10 [options.extras_require] diff --git a/skrub/_interpolation_joiner.py b/skrub/_interpolation_joiner.py index d02f6900e..151eecaa4 
100644 --- a/skrub/_interpolation_joiner.py +++ b/skrub/_interpolation_joiner.py @@ -136,6 +136,7 @@ class InterpolationJoiner(TransformerMixin, BaseEstimator): Examples -------- + >>> import pandas as pd >>> buildings = pd.DataFrame( ... {"latitude": [1.0, 2.0], "longitude": [1.0, 2.0], "n_stories": [3, 7]} ... ) diff --git a/skrub/_joiner.py b/skrub/_joiner.py index 77a48b70d..c3b1c3f91 100644 --- a/skrub/_joiner.py +++ b/skrub/_joiner.py @@ -78,6 +78,7 @@ class Joiner(TransformerMixin, BaseEstimator): Examples -------- + >>> import pandas as pd >>> X = pd.DataFrame(['France', 'Germany', 'Italy'], columns=['Country']) >>> X Country From 9736d0d9b9ad878972f018b2785db0d5eb55ed11 Mon Sep 17 00:00:00 2001 From: Jerome Dockes Date: Thu, 16 Nov 2023 16:39:17 +0100 Subject: [PATCH 09/17] bump pandas version to have dataframe api support --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index f128297c5..4cc6490d1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -28,7 +28,7 @@ install_requires = scikit-learn>=1.2.1 numpy>=1.23.5 scipy>=1.9.3 - pandas>=1.5.3 + pandas>=2.1.0 packaging>=23.1 dataframe-api-compat>=0.1.28 python_requires = >=3.10 From 418f3f36a00b79bfca97252a26975c39365d3450 Mon Sep 17 00:00:00 2001 From: Jerome Dockes Date: Thu, 16 Nov 2023 16:43:14 +0100 Subject: [PATCH 10/17] update setup.cfg --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 4cc6490d1..a9a20c3fb 100644 --- a/setup.cfg +++ b/setup.cfg @@ -80,7 +80,7 @@ min-py310 = scikit-learn==1.2.1 numpy==1.23.5 scipy==1.9.3 - pandas==1.5.3 + pandas==2.1.0 [flake8] # max line length for black From b5f2c5687fc576c4542e352c7df5b51f327fb884 Mon Sep 17 00:00:00 2001 From: Jerome Dockes Date: Thu, 16 Nov 2023 18:04:39 +0100 Subject: [PATCH 11/17] upgrade scikit-learn --- setup.cfg | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.cfg b/setup.cfg index a9a20c3fb..b7fc345f2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -25,7 +25,7 @@ project_urls = include_package_data = True packages = find: install_requires = - scikit-learn>=1.2.1 + scikit-learn>=1.3.0 numpy>=1.23.5 scipy>=1.9.3 pandas>=2.1.0 @@ -77,7 +77,7 @@ benchmarks = # Overwrite the previous install_requires for CI testing purposes # as defined in testing.yml. 
min-py310 = - scikit-learn==1.2.1 + scikit-learn==1.3.0 numpy==1.23.5 scipy==1.9.3 pandas==2.1.0 From b7b0a5904b478eff4f614105f2e803065ee1e2ba Mon Sep 17 00:00:00 2001 From: Jerome Dockes Date: Thu, 16 Nov 2023 18:04:48 +0100 Subject: [PATCH 12/17] remove unused to_numpy --- skrub/_dataframe/_pandas.py | 5 ----- skrub/_dataframe/_polars.py | 7 ------- 2 files changed, 12 deletions(-) diff --git a/skrub/_dataframe/_pandas.py b/skrub/_dataframe/_pandas.py index 499bdd446..4c439a492 100644 --- a/skrub/_dataframe/_pandas.py +++ b/skrub/_dataframe/_pandas.py @@ -23,7 +23,6 @@ "concatenate", "any_rowwise", "collect", - "to_numpy", "to_pandas", ] @@ -384,7 +383,3 @@ def collect(dataframe): def to_pandas(dataframe): return dataframe - - -def to_numpy(dataframe): - return dataframe.to_numpy() diff --git a/skrub/_dataframe/_polars.py b/skrub/_dataframe/_polars.py index d17af0dd4..1ff2b5e96 100644 --- a/skrub/_dataframe/_polars.py +++ b/skrub/_dataframe/_polars.py @@ -28,7 +28,6 @@ "concatenate", "any_rowwise", "collect", - "to_numpy", "to_pandas", ] @@ -328,9 +327,3 @@ def to_pandas(dataframe): if hasattr(dataframe, "collect"): dataframe = dataframe.collect() return dataframe.to_pandas() - - -def to_numpy(dataframe): - if hasattr(dataframe, "to_numpy"): - return dataframe.to_numpy() - return dataframe.collect().to_numpy() From c8b24f3f92b6db7db3f5c8dd502e30006dbf8d5b Mon Sep 17 00:00:00 2001 From: Jerome Dockes Date: Fri, 17 Nov 2023 11:22:29 +0100 Subject: [PATCH 13/17] add more tests --- .github/workflows/testing.yml | 1 + skrub/tests/conftest.py => conftest.py | 0 skrub/_dataframe/__init__.py | 4 +-- skrub/_dataframe/_common.py | 1 - skrub/_dataframe/_pandas.py | 9 +----- skrub/_dataframe/_polars.py | 9 ++---- skrub/_dataframe/tests/test_all.py | 43 ++++++++++++++++++++++++++ 7 files changed, 50 insertions(+), 17 deletions(-) rename skrub/tests/conftest.py => conftest.py (100%) create mode 100644 skrub/_dataframe/tests/test_all.py diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index 5f047366d..787f86757 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -75,6 +75,7 @@ jobs: - shell: bash {0} run: | cp $GITHUB_WORKSPACE/pyproject.toml . + cp $GITHUB_WORKSPACE/conftest.py . 
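
Moving `conftest.py` to the repository root (and copying it into the CI
working directory above) makes the parametrized `px` fixture available to
every test package, including the new `skrub/_dataframe/tests`. A sketch of
a hypothetical test using it:

    def test_roundtrip(px):
        # `px` is bound to the pandas module, then to the polars module when
        # it is installed, so one test body exercises both backends
        df = px.DataFrame({"a": [1, 2]})
        assert list(df.columns) == ["a"]
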
$GITHUB_WORKSPACE/build_tools/github/test.sh working-directory: ${{ runner.temp }} name: 'Run tests' diff --git a/skrub/tests/conftest.py b/conftest.py similarity index 100% rename from skrub/tests/conftest.py rename to conftest.py diff --git a/skrub/_dataframe/__init__.py b/skrub/_dataframe/__init__.py index 3ad5b14d7..038e4ef1c 100644 --- a/skrub/_dataframe/__init__.py +++ b/skrub/_dataframe/__init__.py @@ -1,4 +1,4 @@ -from ._common import std, stdns +from ._common import Selector, std, stdns from ._namespace import get_df_namespace, skrubns -__all__ = ["get_df_namespace", "skrubns", "std", "stdns"] +__all__ = ["get_df_namespace", "skrubns", "std", "stdns", "Selector"] diff --git a/skrub/_dataframe/_common.py b/skrub/_dataframe/_common.py index 814592238..bb95b61ba 100644 --- a/skrub/_dataframe/_common.py +++ b/skrub/_dataframe/_common.py @@ -6,7 +6,6 @@ class Selector(enum.Enum): NONE = enum.auto() NUMERIC = enum.auto() CATEGORICAL = enum.auto() - STRING = enum.auto() def std(obj): diff --git a/skrub/_dataframe/_pandas.py b/skrub/_dataframe/_pandas.py index 4c439a492..2faf940f2 100644 --- a/skrub/_dataframe/_pandas.py +++ b/skrub/_dataframe/_pandas.py @@ -22,7 +22,6 @@ "Selector", "concatenate", "any_rowwise", - "collect", "to_pandas", ] @@ -356,14 +355,12 @@ def select(dataframe, columns): return dataframe.select_dtypes("number") elif columns is Selector.CATEGORICAL: return dataframe.select_dtypes(["object", "string", "category"]) - elif columns is Selector.STRING: - return dataframe.select_dtypes(["string"]) # we have covered all items in the enumeration assert False def drop(dataframe, columns): - return dataframe.drop(select(dataframe, columns).columns.values) + return dataframe.drop(select(dataframe, columns).columns.values, axis=1) def any_rowwise(dataframe): @@ -377,9 +374,5 @@ def concatenate(dataframe, *other_dataframes): return pd.concat([dataframe] + list(other_dataframes), axis=1) -def collect(dataframe): - return dataframe - - def to_pandas(dataframe): return dataframe diff --git a/skrub/_dataframe/_polars.py b/skrub/_dataframe/_polars.py index 1ff2b5e96..e179d0b31 100644 --- a/skrub/_dataframe/_polars.py +++ b/skrub/_dataframe/_polars.py @@ -27,7 +27,6 @@ "Selector", "concatenate", "any_rowwise", - "collect", "to_pandas", ] @@ -292,8 +291,6 @@ def _check_selector(columns): return cs.numeric() elif columns is Selector.CATEGORICAL: return cs.string(include_categorical=True) - elif columns is Selector.STRING: - return cs.string() # we have covered all items in the enumeration assert False @@ -307,17 +304,17 @@ def drop(dataframe, columns): def any_rowwise(dataframe): - return collect(dataframe.select(pl.any_horizontal(pl.all()))).get_column("any") + return _collect(dataframe.select(pl.any_horizontal(pl.all()))).get_column("any") def concatenate(dataframe, *other_dataframes): return pl.concat( - [collect(dataframe)] + [collect(df) for df in other_dataframes], + [_collect(dataframe)] + [_collect(df) for df in other_dataframes], how="horizontal", ) -def collect(dataframe): +def _collect(dataframe): if hasattr(dataframe, "collect"): dataframe = dataframe.collect() return dataframe diff --git a/skrub/_dataframe/tests/test_all.py b/skrub/_dataframe/tests/test_all.py new file mode 100644 index 000000000..d46140968 --- /dev/null +++ b/skrub/_dataframe/tests/test_all.py @@ -0,0 +1,43 @@ +import pytest + +from skrub._dataframe import Selector, skrubns + + +@pytest.fixture +def df(px): + return px.DataFrame( + {"ID": [2, 3, 7], "name": ["ab", "cd", "01"], "temp": [20.3, 40.9, 11.5]} + 
) + + +def test_select(df): + ns = skrubns(df) + assert list(ns.select(df, []).columns) == [] + assert list(ns.select(df, ["name"]).columns) == ["name"] + assert list(ns.select(df, Selector.ALL).columns) == list(df.columns) + assert list(ns.select(df, Selector.NONE).columns) == [] + assert list(ns.select(df, Selector.NUMERIC).columns) == ["ID", "temp"] + assert list(ns.select(df, Selector.CATEGORICAL).columns) == ["name"] + + +def test_drop(df): + ns = skrubns(df) + assert list(ns.drop(df, []).columns) == list(df.columns) + assert list(ns.drop(df, ["name"]).columns) == ["ID", "temp"] + assert list(ns.drop(df, Selector.ALL).columns) == [] + assert list(ns.drop(df, Selector.NONE).columns) == list(df.columns) + assert list(ns.drop(df, Selector.NUMERIC).columns) == ["name"] + assert list(ns.drop(df, Selector.CATEGORICAL).columns) == ["ID", "temp"] + + +def test_concatenate(df): + ns = skrubns(df) + df1 = ( + df.__dataframe_consortium_standard__() + .rename_columns({c: f"{c}_1" for c in df.columns}) + .dataframe + ) + out = ns.concatenate(df) + assert list(out.columns) == list(df.columns) + out = ns.concatenate(df, df1) + assert list(out.columns) == list(df.columns) + list(df1.columns) From a9ac96da20c6c990305ca106fdb86a50f12c09f9 Mon Sep 17 00:00:00 2001 From: Jerome Dockes Date: Fri, 17 Nov 2023 11:23:39 +0100 Subject: [PATCH 14/17] update changelog --- CHANGES.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index dd6726821..caf4351bf 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -17,7 +17,7 @@ Major changes * :class:`InterpolationJoiner` was added to join two tables by using machine-learning to infer the matching rows from the second table. - :pr:`742` by :user:`Jérôme Dockès `. + :pr:`742` and :pr:`827` by :user:`Jérôme Dockès `. * Pipelines including :class:`TableVectorizer` can now be grid-searched, since we can now call `set_params` on the default transformers of :class:`TableVectorizer`. 
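
The next commit generalizes the `std`/`stdns` helpers that patch 03 added to
`_common.py`. As introduced there, they wrap a native dataframe into the
consortium-standard API and fetch the standard namespace for its backend; a
usage sketch (pandas shown, polars behaves the same):

    import pandas as pd

    from skrub._dataframe import std, stdns

    df = pd.DataFrame({"a": [1.0, None]})
    std(df).column_names       # ['a'] -- the standard dataframe wrapper
    ns = stdns(df)             # the standard namespace for this backend
    ns.dataframe_from_columns  # constructor used by _predict in patch 03
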
From 2e6e19cdeb200270a7a4d084be7c4584ad07d860 Mon Sep 17 00:00:00 2001 From: Jerome Dockes Date: Fri, 17 Nov 2023 11:28:11 +0100 Subject: [PATCH 15/17] column namespace --- skrub/_dataframe/_common.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/skrub/_dataframe/_common.py b/skrub/_dataframe/_common.py index bb95b61ba..78edd4e5a 100644 --- a/skrub/_dataframe/_common.py +++ b/skrub/_dataframe/_common.py @@ -15,5 +15,8 @@ def std(obj): return obj.__column_consortium_standard__() -def stdns(dataframe): - return dataframe.__dataframe_consortium_standard__().__dataframe_namespace__() +def stdns(obj): + try: + return obj.__dataframe_consortium_standard__().__dataframe_namespace__() + except AttributeError: + return obj.__column_consortium_standard__().__column_namespace__() From 1be7187abe8ab95b67a01baf596495351a1c4d4f Mon Sep 17 00:00:00 2001 From: Jerome Dockes Date: Fri, 17 Nov 2023 11:47:28 +0100 Subject: [PATCH 16/17] more tests --- skrub/_dataframe/tests/test_common.py | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 skrub/_dataframe/tests/test_common.py diff --git a/skrub/_dataframe/tests/test_common.py b/skrub/_dataframe/tests/test_common.py new file mode 100644 index 000000000..182a664e8 --- /dev/null +++ b/skrub/_dataframe/tests/test_common.py @@ -0,0 +1,11 @@ +from skrub._dataframe import skrubns, std, stdns + + +def test_std(px): + df = px.DataFrame({"A": [1, 2]}) + assert hasattr(std(df), "dataframe") + assert hasattr(stdns(df), "dataframe_from_columns") + ns = skrubns(df) + s = ns.make_series([1, 2], name="A") + assert hasattr(std(s), "column") + assert hasattr(stdns(s), "dataframe_from_columns") From c09f577a1e766e2657d94cf277ab6645a0e6ffb1 Mon Sep 17 00:00:00 2001 From: Jerome Dockes Date: Fri, 17 Nov 2023 11:58:25 +0100 Subject: [PATCH 17/17] rename concatenate -> concat_horizontal --- skrub/_dataframe/_pandas.py | 4 ++-- skrub/_dataframe/_polars.py | 4 ++-- skrub/_dataframe/tests/test_all.py | 6 +++--- skrub/_interpolation_joiner.py | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/skrub/_dataframe/_pandas.py b/skrub/_dataframe/_pandas.py index 2faf940f2..c64ebdff6 100644 --- a/skrub/_dataframe/_pandas.py +++ b/skrub/_dataframe/_pandas.py @@ -20,7 +20,7 @@ "select", "drop", "Selector", - "concatenate", + "concat_horizontal", "any_rowwise", "to_pandas", ] @@ -367,7 +367,7 @@ def any_rowwise(dataframe): return dataframe.any(axis=1) -def concatenate(dataframe, *other_dataframes): +def concat_horizontal(dataframe, *other_dataframes): other_dataframes = [ df.set_axis(dataframe.index, axis="index") for df in other_dataframes ] diff --git a/skrub/_dataframe/_polars.py b/skrub/_dataframe/_polars.py index e179d0b31..1b52ee6ea 100644 --- a/skrub/_dataframe/_polars.py +++ b/skrub/_dataframe/_polars.py @@ -25,7 +25,7 @@ "select", "drop", "Selector", - "concatenate", + "concat_horizontal", "any_rowwise", "to_pandas", ] @@ -307,7 +307,7 @@ def any_rowwise(dataframe): return _collect(dataframe.select(pl.any_horizontal(pl.all()))).get_column("any") -def concatenate(dataframe, *other_dataframes): +def concat_horizontal(dataframe, *other_dataframes): return pl.concat( [_collect(dataframe)] + [_collect(df) for df in other_dataframes], how="horizontal", diff --git a/skrub/_dataframe/tests/test_all.py b/skrub/_dataframe/tests/test_all.py index d46140968..8c18fdbc5 100644 --- a/skrub/_dataframe/tests/test_all.py +++ b/skrub/_dataframe/tests/test_all.py @@ -30,14 +30,14 @@ def test_drop(df): assert list(ns.drop(df, 
Selector.CATEGORICAL).columns) == ["ID", "temp"] -def test_concatenate(df): +def test_concat_horizontal(df): ns = skrubns(df) df1 = ( df.__dataframe_consortium_standard__() .rename_columns({c: f"{c}_1" for c in df.columns}) .dataframe ) - out = ns.concatenate(df) + out = ns.concat_horizontal(df) assert list(out.columns) == list(df.columns) - out = ns.concatenate(df, df1) + out = ns.concat_horizontal(df, df1) assert list(out.columns) == list(df.columns) + list(df1.columns) diff --git a/skrub/_interpolation_joiner.py b/skrub/_interpolation_joiner.py index 151eecaa4..f696dd4df 100644 --- a/skrub/_interpolation_joiner.py +++ b/skrub/_interpolation_joiner.py @@ -311,7 +311,7 @@ def transform(self, X): prediction_results = self._check_prediction_results(prediction_results) predictions = [res["predictions"] for res in prediction_results] predictions = _add_column_name_suffix(predictions, self.suffix) - return skrubns(self.aux_table).concatenate(X, *predictions) + return skrubns(self.aux_table).concat_horizontal(X, *predictions) def _check_prediction_results(self, results): checked_results = []
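
Taken together, the series lets `InterpolationJoiner` consume and return
polars frames end to end. A condensed sketch mirroring the updated tests
(assuming polars and the bumped dependencies are installed):

    import polars as pl
    from sklearn.neighbors import KNeighborsRegressor

    from skrub import InterpolationJoiner

    buildings = pl.DataFrame(
        {"latitude": [1.0, 2.0], "longitude": [1.0, 2.0], "n_stories": [3, 7]}
    )
    weather = pl.DataFrame(
        {
            "latitude": [1.2, 0.9, 1.9, 1.7],
            "longitude": [0.8, 1.1, 1.8, 1.8],
            "avg_temp": [10.5, 10.5, 15.5, 15.5],
        }
    )
    join = InterpolationJoiner(
        weather,
        key=["latitude", "longitude"],
        regressor=KNeighborsRegressor(2),
    ).fit_transform(buildings)
    # `join` is a polars DataFrame: `buildings` plus a predicted "avg_temp"
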