skrub-data · rcap107 · Nov 21, 2024 · Nov 25, 2024 · Nov 26, 2024 · Nov 26, 2024
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -17,6 +17,9 @@ Release 0.4.1
 
 New features
 ------------
+* The :class:`StringEncoder` encodes strings using tf-idf and truncated SVD
+  decomposition and provides a cheaper alternative to :class:`GapEncoder`.
+  :pr:`1159` by :user:`Riccardo Cappuzzo<rcap107>`.
 
 Changes
 -------

diff --git a/examples/02_text_with_string_encoders.py b/examples/02_text_with_string_encoders.py
@@ -17,6 +17,9 @@
 .. |TextEncoder| replace::
      :class:`~skrub.TextEncoder`
 
+.. |StringEncoder| replace::
+     :class:`~skrub.StringEncoder`
+
 .. |TableReport| replace::
      :class:`~skrub.TableReport`
 
@@ -132,7 +135,7 @@ def plot_gap_feature_importance(X_trans):
 # We set ``n_components`` to 30; however, to achieve the best performance, we would
 # need to find the optimal value for this hyperparameter using either |GridSearchCV|
 # or |RandomizedSearchCV|. We skip this part to keep the computation time for this
-# example small.
+# example short.
 #
 # Recall that the ROC AUC is a metric that quantifies the ranking power of estimators,
 # where a random estimator scores 0.5, and an oracle —providing perfect predictions—
@@ -221,6 +224,26 @@ def plot_box_results(named_results):
 
 plot_box_results(results)
 
+# %%
+# |TextEncoder| embeddings are very strong, but they are also quite expensive to
+# use. A simpler, faster alternative for encoding strings is the |StringEncoder|,
+# which works by first performing a tf-idf (computing vectors of rescaled word
+# counts, `wiki <https://en.wikipedia.org/wiki/Tf%E2%80%93idf>`_) of the text, and then
+# following it with TruncatedSVD to reduce the number of dimensions to, in this
+# case, 30.
+from skrub import StringEncoder
+
+string_encoder = StringEncoder(n_components=30)
+
+# Start from a clone of ``gap_pipe`` and set its
+# ``tablevectorizer__high_cardinality`` parameter to the StringEncoder.
+string_encoder_pipe = clone(gap_pipe).set_params(
+    **{"tablevectorizer__high_cardinality": string_encoder}
+)
+string_encoder_results = cross_validate(string_encoder_pipe, X, y, scoring="roc_auc")
+# Append the new scores to the accumulated results before plotting them again.
+results.append(("StringEncoder", string_encoder_results))
+
+plot_box_results(results)
+
+
 # %%
 # The performance of the |TextEncoder| is significantly stronger than that of
 # the syntactic encoders, which is expected. But how long does it take to load
@@ -232,7 +255,7 @@ def plot_box_results(named_results):
 
 def plot_performance_tradeoff(results):
     fig, ax = plt.subplots(figsize=(5, 4), dpi=200)
-    markers = ["s", "o", "^"]
+    markers = ["s", "o", "^", "x"]
     for idx, (name, result) in enumerate(results):
         ax.scatter(
             result["fit_time"],
@@ -293,8 +316,11 @@ def plot_performance_tradeoff(results):
 # During the subsequent cross-validation iterations, the model is simply copied,
 # which reduces computation time for the remaining folds.
 #
+# Interestingly, |StringEncoder| has a performance remarkably similar to that of
+# |GapEncoder|, while being significantly faster.
+#
 # Conclusion
 # ----------
 # In conclusion, |TextEncoder| provides powerful vectorization for text, but at
 # the cost of longer computation times and the need for additional dependencies,
-# such as torch.
+# such as torch. |StringEncoder| represents a simpler alternative that can provide
+# good performance at a fraction of the cost of more complex methods.
diff --git a/examples/results.png b/examples/results.png
diff --git a/skrub/__init__.py b/skrub/__init__.py
@@ -17,6 +17,7 @@
 from ._reporting import TableReport, patch_display, unpatch_display
 from ._select_cols import DropCols, SelectCols
 from ._similarity_encoder import SimilarityEncoder
+from ._string_encoder import StringEncoder
 from ._table_vectorizer import TableVectorizer
 from ._tabular_learner import tabular_learner
 from ._text_encoder import TextEncoder
@@ -53,5 +54,6 @@
     "SelectCols",
     "DropCols",
     "TextEncoder",
+    "StringEncoder",
     "column_associations",
 ]
diff --git a/skrub/_string_encoder.py b/skrub/_string_encoder.py
@@ -0,0 +1,201 @@
+from sklearn.decomposition import TruncatedSVD
+from sklearn.feature_extraction.text import (
+    HashingVectorizer,
+    TfidfTransformer,
+    TfidfVectorizer,
+)
+from sklearn.pipeline import Pipeline
+from sklearn.utils.validation import check_is_fitted
+
+from . import _dataframe as sbd
+from ._on_each_column import SingleColumnTransformer
+
+
+class StringEncoder(SingleColumnTransformer):
+    """Generate a lightweight string encoding of a given column using tf-idf \
+        vectorization and truncated SVD.
+
+    First, apply a tf-idf vectorization of the text, then reduce the dimensionality
+    with a truncated SVD decomposition with the given number of components.
+
+    New features will be named `{col_name}_{component}` if the series has a name,
+    and `tsvd_{component}` if it does not.
+
+    Parameters
+    ----------
+    n_components : int, default=30
+        Number of components to be used for the truncated SVD decomposition.
+        Must be a positive integer.
+
+    vectorizer : str, "tfidf" or "hashing", default="tfidf"
+        Vectorizer to apply to the strings, either `tfidf` or `hashing` for
+        scikit-learn TfidfVectorizer or HashingVectorizer respectively. With
+        `hashing`, the output of the HashingVectorizer is passed through a
+        TfidfTransformer before the truncated SVD.
+
+    ngram_range : tuple of (int, int), default=(3, 4)
+        The lower and upper boundary of the range of n-values for different
+        n-grams to be extracted. All values of n such that min_n <= n <= max_n
+        will be used. For example an `ngram_range` of `(1, 1)` means only unigrams,
+        `(1, 2)` means unigrams and bigrams, and `(2, 2)` means only bigrams.
+
+    analyzer : str, "char", "word" or "char_wb", default="char_wb"
+        Whether the feature should be made of word or character n-grams.
+        Option `char_wb` creates character n-grams only from text inside word
+        boundaries; n-grams at the edges of words are padded with space.
+
+    See Also
+    --------
+    MinHashEncoder :
+        Encode string columns as a numeric array with the minhash method.
+    GapEncoder :
+        Encode string columns by constructing latent topics.
+    SimilarityEncoder :
+        Encode string columns as a numeric array with n-gram string similarity.
+    TextEncoder :
+        Encode string columns using pre-trained language models.
+
+    Examples
+    --------
+    >>> import pandas as pd
+    >>> from skrub import StringEncoder
+
+    We will encode the comments using 2 components:
+
+    >>> enc = StringEncoder(n_components=2)
+    >>> X = pd.Series([
+    ...   "The professor snatched a good interview out of the jaws of these questions.",
+    ...   "Bookmarking this to watch later.",
+    ...   "When you don't know the lyrics of the song except the chorus",
+    ... ], name='video comments')
+
+    >>> enc.fit_transform(X) # doctest: +SKIP
+       video comments_0  video comments_1
+    0      8.218069e-01      4.557474e-17
+    1      6.971618e-16      1.000000e+00
+    2      8.218069e-01     -3.046564e-16
+    """
+
+    def __init__(
+        self,
+        n_components=30,
+        vectorizer="tfidf",
+        ngram_range=(3, 4),
+        analyzer="char_wb",
+    ):
+        # Parameters are stored untouched; they are validated in fit_transform.
+        self.n_components = n_components
+        self.vectorizer = vectorizer
+        self.ngram_range = ngram_range
+        self.analyzer = analyzer
+
+    def get_feature_names_out(self):
+        """Get output feature names for transformation.
+
+        Returns
+        -------
+        feature_names_out : list of str objects
+            Transformed feature names.
+        """
+        return list(self.all_outputs_)
+
+    def fit_transform(self, X, y=None):
+        """Fit the encoder and transform a column.
+
+        Parameters
+        ----------
+        X : Pandas or Polars series
+            The column to transform.
+        y : None
+            Unused. Here for compatibility with scikit-learn.
+
+        Returns
+        -------
+        X_out: Pandas or Polars dataframe with shape (len(X), tsvd_n_components)
+            The embedding representation of the input.
+
+        Raises
+        ------
+        ValueError
+            If `analyzer` or `vectorizer` is not one of the supported options,
+            if `ngram_range` does not hold exactly two positive integers, or if
+            `n_components` is not a positive integer.
+        """
+        del y
+
+        # ERROR CHECKING
+        if self.analyzer not in ["char_wb", "char", "word"]:
+            raise ValueError(f"Unknown analyzer {self.analyzer}")
+
+        if not all(isinstance(x, int) and x > 0 for x in self.ngram_range):
+            raise ValueError(
+                "Values in `ngram_range` must be positive integers, "
+                f"found {self.ngram_range} instead."
+            )
+        if len(self.ngram_range) != 2:
+            raise ValueError(
+                f"`ngram_range` must have length 2, found {len(self.ngram_range)}."
+            )
+
+        # The negation must cover both conditions: without the parentheses,
+        # zero and negative integers were silently accepted.
+        if not (isinstance(self.n_components, int) and self.n_components > 0):
+            raise ValueError(
+                f"`n_components` must be a positive integer, found {self.n_components}"
+            )
+
+        # Both vectorizer options end with a tf-idf weighting followed by the
+        # same truncated SVD step; only the first stage(s) differ.
+        if self.vectorizer == "tfidf":
+            steps = [
+                (
+                    "tfidf",
+                    TfidfVectorizer(
+                        ngram_range=self.ngram_range, analyzer=self.analyzer
+                    ),
+                ),
+            ]
+        elif self.vectorizer == "hashing":
+            steps = [
+                (
+                    "hashing",
+                    HashingVectorizer(
+                        ngram_range=self.ngram_range, analyzer=self.analyzer
+                    ),
+                ),
+                ("tfidf", TfidfTransformer()),
+            ]
+        else:
+            raise ValueError(f"Unknown vectorizer {self.vectorizer}.")
+        steps.append(("tsvd", TruncatedSVD(n_components=self.n_components)))
+        self.pipe = Pipeline(steps)
+
+        # Output columns are prefixed with the input column name, falling back
+        # to "tsvd" when the series has no (or an empty) name.
+        name = sbd.name(X)
+        if not name:
+            name = "tsvd"
+        self.all_outputs_ = [f"{name}_{idx}" for idx in range(self.n_components)]
+
+        result = self.pipe.fit_transform(sbd.to_numpy(X))
+
+        self._is_fitted = True
+
+        return self._transform(X, result)
+
+    def transform(self, X):
+        """Transform a column.
+
+        Parameters
+        ----------
+        X : Pandas or Polars series
+            The column to transform.
+
+        Returns
+        -------
+        X_out: Pandas or Polars dataframe with shape (len(X), tsvd_n_components)
+            The embedding representation of the input.
+        """
+        check_is_fitted(self)
+
+        result = self.pipe.transform(sbd.to_numpy(X))
+        return self._transform(X, result)
+
+    def _transform(self, X, result):
+        """Wrap the embedding array in a dataframe like `X`, keeping its index."""
+        # `result.T` yields one array per component, paired with its output name.
+        result = sbd.make_dataframe_like(X, dict(zip(self.all_outputs_, result.T)))
+        result = sbd.copy_index(X, result)
+
+        return result
+
+    def __sklearn_is_fitted__(self):
+        """
+        Check fitted status and return a Boolean value.
+        """
+        # `_is_fitted` only exists once fit_transform has completed.
+        return getattr(self, "_is_fitted", False)