skrub-data · rcap107 · Nov 21, 2024 · Nov 25, 2024 · Nov 26, 2024 · Nov 26, 2024
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -14,7 +14,9 @@ It is currently undergoing fast development and backward compatibility is not en
 
 New features
 ------------
-
+* The :class:`StringEncoder` encodes strings using tf-idf vectorization followed
+  by truncated SVD, and provides a cheaper alternative to :class:`TextEncoder`.
+  :pr:`1159` by :user:`Riccardo Cappuzzo<rcap107>`.
 
 Changes
 -------

diff --git a/example_string_encoder.py b/example_string_encoder.py
@@ -0,0 +1,33 @@
+# %% test string encoder
+import polars as pl
+from sklearn.decomposition import PCA
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.pipeline import Pipeline
+
+from skrub._string_encoder import StringEncoder
+
+corpus = [
+    "this is the first document",
+    "this document is the second document",
+    "and this is the third one",
+    "is this the first document",
+]
+column = pl.Series(name="this_column", values=corpus)
+
+# %%
+
+pipe = Pipeline(
+    [
+        ("tfidf", TfidfVectorizer()),
+        ("pca", PCA(n_components=2)),
+    ]
+)
+# %%
+a = pipe.fit_transform(corpus)
+
+# %%
+se = StringEncoder(2)
+
+# %%
+r = se.fit_transform(column)
+# %%
diff --git a/examples/02_text_with_string_encoders.py b/examples/02_text_with_string_encoders.py
@@ -17,6 +17,9 @@
 .. |TextEncoder| replace::
      :class:`~skrub.TextEncoder`
 
+.. |StringEncoder| replace::
+     :class:`~skrub.StringEncoder`
+
 .. |TableReport| replace::
      :class:`~skrub.TableReport`
 
@@ -132,7 +135,7 @@ def plot_gap_feature_importance(X_trans):
 # We set ``n_components`` to 30; however, to achieve the best performance, we would
 # need to find the optimal value for this hyperparameter using either |GridSearchCV|
 # or |RandomizedSearchCV|. We skip this part to keep the computation time for this
-# example small.
+# example short.
 #
 # Recall that the ROC AUC is a metric that quantifies the ranking power of estimators,
 # where a random estimator scores 0.5, and an oracle —providing perfect predictions—
@@ -221,6 +224,25 @@ def plot_box_results(named_results):
 
 plot_box_results(results)
 
+# %%
+# |TextEncoder| embeddings are very strong, but they are also quite expensive to
+# train. A simpler, faster alternative for encoding strings is the |StringEncoder|,
+# which works by first performing a tf-idf vectorization of the text, and then
+# following it with TruncatedSVD to reduce the number of dimensions to, in this
+# case, 30.
+from skrub import StringEncoder
+
+string_encoder = StringEncoder(n_components=30)
+
+string_encoder_pipe = clone(gap_pipe).set_params(
+    **{"tablevectorizer__high_cardinality": string_encoder}
+)
+string_encoder_results = cross_validate(string_encoder_pipe, X, y, scoring="roc_auc")
+results.append(("StringEncoder", string_encoder_results))
+
+plot_box_results(results)
+
+
 # %%
 # The performance of the |TextEncoder| is significantly stronger than that of
 # the syntactic encoders, which is expected. But how long does it take to load
@@ -232,7 +254,7 @@ def plot_box_results(named_results):
 
 def plot_performance_tradeoff(results):
     fig, ax = plt.subplots(figsize=(5, 4), dpi=200)
-    markers = ["s", "o", "^"]
+    markers = ["s", "o", "^", "x"]
     for idx, (name, result) in enumerate(results):
         ax.scatter(
             result["fit_time"],
@@ -293,8 +315,12 @@ def plot_performance_tradeoff(results):
 # During the subsequent cross-validation iterations, the model is simply copied,
 # which reduces computation time for the remaining folds.
 #
+# Interestingly, |StringEncoder| has a performance remarkably similar to that of
+# |GapEncoder|, while being significantly faster.
+#
 # Conclusion
 # ----------
 # In conclusion, |TextEncoder| provides powerful vectorization for text, but at
 # the cost of longer computation times and the need for additional dependencies,
-# such as torch.
+# such as torch. |StringEncoder| represents a simpler alternative that can provide
+# good performance at a fraction of the cost of more complex methods.
diff --git a/skrub/__init__.py b/skrub/__init__.py
@@ -17,6 +17,7 @@
 from ._reporting import TableReport, patch_display, unpatch_display
 from ._select_cols import DropCols, SelectCols
 from ._similarity_encoder import SimilarityEncoder
+from ._string_encoder import StringEncoder
 from ._table_vectorizer import TableVectorizer
 from ._tabular_learner import tabular_learner
 from ._text_encoder import TextEncoder
@@ -53,5 +54,6 @@
     "SelectCols",
     "DropCols",
     "TextEncoder",
+    "StringEncoder",
     "column_associations",
 ]
diff --git a/skrub/_string_encoder.py b/skrub/_string_encoder.py
@@ -0,0 +1,132 @@
+from sklearn.decomposition import TruncatedSVD
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.pipeline import Pipeline
+from sklearn.utils.validation import check_is_fitted
+
+from . import _dataframe as sbd
+from ._on_each_column import SingleColumnTransformer
+
+
+class StringEncoder(SingleColumnTransformer):
+    """Generate a lightweight string encoding of a given column using tf-idf \
+        vectorization and truncated SVD.
+
+    First, apply a tf-idf vectorization of the text, then reduce the dimensionality
+    with a truncated SVD using the given number of components.
+
+    New features will be named `{col_name}_{component}` if the series has a name,
+    and `tsvd_{component}` if it does not.
+
+    Parameters
+    ----------
+    n_components : int
+        Number of components to be used for the truncated SVD.
+
+    See Also
+    --------
+    MinHashEncoder :
+        Encode string columns as a numeric array with the minhash method.
+    GapEncoder :
+        Encode string columns by constructing latent topics.
+    SimilarityEncoder :
+        Encode string columns as a numeric array with n-gram string similarity.
+    TextEncoder :
+        Encode string columns using pre-trained language models.
+
+    Examples
+    --------
+    >>> import pandas as pd
+    >>> from skrub import StringEncoder
+
+    We will encode the comments using 2 components:
+
+    >>> enc = StringEncoder(n_components=2)
+    >>> X = pd.Series([
+    ...   "The professor snatched a good interview out of the jaws of these questions.",
+    ...   "Bookmarking this to watch later.",
+    ...   "When you don't know the lyrics of the song except the chorus",
+    ... ], name='video comments')
+
+    >>> enc.fit_transform(X) # doctest: +SKIP
+       video comments_0  video comments_1
+    0      8.218069e-01      4.557474e-17
+    1      6.971618e-16      1.000000e+00
+    2      8.218069e-01     -3.046564e-16
+    """
+
+    def __init__(self, n_components=30):
+        self.n_components = n_components
+
+    def _transform(self, X):
+        result = self.pipe.transform(sbd.to_numpy(X))
+
+        result = sbd.make_dataframe_like(X, dict(zip(self.all_outputs_, result.T)))
+        result = sbd.copy_index(X, result)
+
+        return result
+
+    def get_feature_names_out(self):
+        """Get output feature names for transformation.
+
+        Returns
+        -------
+        feature_names_out : list of str objects
+            Transformed feature names.
+        """
+        return list(self.all_outputs_)
+
+    def fit_transform(self, X, y=None):
+        """Fit the encoder and transform a column.
+
+        Parameters
+        ----------
+        X : Pandas or Polars series
+            The column to transform.
+        y : None
+            Unused. Here for compatibility with scikit-learn.
+
+        Returns
+        -------
+        X_out: Pandas or Polars dataframe with shape (len(X), n_components)
+            The embedding representation of the input.
+        """
+        del y
+        self.pipe = Pipeline(
+            [
+                ("tfidf", TfidfVectorizer()),
+                ("tsvd", TruncatedSVD(n_components=self.n_components)),
+            ]
+        )
+
+        name = sbd.name(X)
+        if not name:
+            name = "tsvd"
+        self.all_outputs_ = [f"{name}_{idx}" for idx in range(self.n_components)]
+
+        self.pipe.fit(sbd.to_numpy(X))
+
+        self._is_fitted = True
+
+        return self.transform(X)
+
+    def transform(self, X):
+        """Transform a column.
+
+        Parameters
+        ----------
+        X : Pandas or Polars series
+            The column to transform.
+
+        Returns
+        -------
+        X_out: Pandas or Polars dataframe with shape (len(X), n_components)
+            The embedding representation of the input.
+        """
+        check_is_fitted(self)
+        return self._transform(X)
+
+    def __sklearn_is_fitted__(self):
+        """
+        Check fitted status and return a Boolean value.
+        """
+        return hasattr(self, "_is_fitted") and self._is_fitted
diff --git a/skrub/tests/test_string_encoder.py b/skrub/tests/test_string_encoder.py
@@ -0,0 +1,68 @@
+import pytest
+from sklearn.decomposition import TruncatedSVD
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.pipeline import Pipeline
+
+from skrub import _dataframe as sbd
+from skrub._string_encoder import StringEncoder
+
+
+@pytest.fixture
+def encode_column(df_module):
+    corpus = [
+        "this is the first document",
+        "this document is the second document",
+        "and this is the third one",
+        "is this the first document",
+    ]
+
+    return df_module.make_column("col1", corpus)
+
+
+def test_encoding(encode_column, df_module):
+    pipe = Pipeline(
+        [
+            ("tfidf", TfidfVectorizer()),
+            ("tsvd", TruncatedSVD(n_components=2)),
+        ]
+    )
+    check = pipe.fit_transform(sbd.to_numpy(encode_column))
+
+    names = [f"col1_{idx}" for idx in range(2)]
+
+    check_df = df_module.make_dataframe(dict(zip(names, check.T)))
+
+    se = StringEncoder(2)
+    result = se.fit_transform(encode_column)
+
+    # Converting dtypes to avoid nullable shenanigans
+    check_df = sbd.pandas_convert_dtypes(check_df)
+    result = sbd.pandas_convert_dtypes(result)
+
+    df_module.assert_frame_equal(check_df, result)
+
+
+def test_get_feature_names_out(encode_column, df_module):
+    """Test that ``get_feature_names_out`` returns the correct feature names."""
+    encoder = StringEncoder(n_components=4)
+
+    encoder.fit(encode_column)
+    expected_columns = ["col1_0", "col1_1", "col1_2", "col1_3"]
+    assert encoder.get_feature_names_out() == expected_columns
+
+    # Checking that a series with an empty name generates the proper column names
+    X = df_module.make_column(
+        None,
+        [
+            "this is the first document",
+            "this document is the second document",
+            "and this is the third one",
+            "is this the first document",
+        ],
+    )
+
+    encoder = StringEncoder(n_components=4)
+
+    encoder.fit(X)
+    expected_columns = ["tsvd_0", "tsvd_1", "tsvd_2", "tsvd_3"]
+    assert encoder.get_feature_names_out() == expected_columns