
Update Tutorial #1

Closed · wants to merge 2 commits
10 changes: 10 additions & 0 deletions docs/data_preparation.rst
@@ -0,0 +1,10 @@
Data Preparation
================

.. toctree::
    :maxdepth: 1
    :titlesonly:


    ../auto_examples/plot_dataset_tutorial
    ../auto_examples/plot_linear_feature_gen
12 changes: 6 additions & 6 deletions docs/examples/plot_dataset_tutorial.py
@@ -1,8 +1,8 @@
"""
-An Example of Using Data Stored in Different Forms
+Using Data not in Default Forms
===================================================

Different data sets are stored in various structures and formats.
Different datasets are stored in various structures and formats.
To apply LibMultiLabel with any of them, one must convert the data to a form accepted by the library first.
In this tutorial, we demonstrate an example of converting a Hugging Face dataset.
Before we start, note that the LibMultiLabel format consists of IDs (optional), labels, and raw texts.
@@ -21,8 +21,8 @@
from datasets import load_dataset

######################################################################
-# We choose a multi-label set ``emoji`` from ``tweet_eval`` in this example.
-# The data set can be loaded by the following code.
+# We choose a multi-label dataset ``emoji`` from ``tweet_eval`` in this example.
+# The dataset can be loaded by the following code.

hf_datasets = dict()
hf_datasets["train"] = load_dataset("tweet_eval", "emoji", split="train")
@@ -60,9 +60,9 @@
datasets = preprocessor.fit_transform(datasets)

###############################################################################
-# Also, if you want to use a NN model,
+# In this case, if you want to use a deep learning model,
# use ``load_datasets`` from ``libmultilabel.nn.data_utils`` and change the data to the dataframes we created.
-# Here is the modification of our `Bert model quickstart <https://www.csie.ntu.edu.tw/~cjlin/libmultilabel/auto_examples/plot_BERT_quickstart.html>`_.
+# Here is the modification of our `Bert model quickstart <../auto_examples/plot_bert_quickstart.html>`_.

from libmultilabel.nn.data_utils import load_datasets
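######################################################################
# A minimal sketch of the modified loading step; that ``load_datasets``
# accepts the dataframes we created instead of file paths follows the
# statement above, and ``df_train``/``df_test`` are the hypothetical
# dataframes from the earlier sketch.

datasets = load_datasets(df_train, df_test)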

33 changes: 33 additions & 0 deletions docs/examples/plot_linear_feature_gen.py
@@ -0,0 +1,33 @@
"""
Tweaking Feature Generation for Linear Methods
=============================================================

In both `API <../auto_examples/plot_linear_quickstart.html>`_ and `CLI <../cli/linear.html>`_ usage of linear methods, LibMultiLabel handles the feature generation step by default.
Unless necessary, you do not need to generate features in a customized way as described in this tutorial.

This tutorial demonstrates how to customize feature generation for linear methods through an API example.
Here we use the `rcv1 <https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multilabel.html#rcv1v2%20(topics;%20full%20sets)>`_ dataset as an example.
"""

from libmultilabel import linear

datasets = linear.load_dataset("txt", "data/rcv1/train.txt", "data/rcv1/test.txt")
tfidf_params = {
    "max_features": 20000,
    "min_df": 3,
    "ngram_range": (1, 3)
}
preprocessor = linear.Preprocessor(tfidf_params=tfidf_params)
preprocessor.fit(datasets)
datasets = preprocessor.transform(datasets)

############################################
# The argument ``tfidf_params`` of the ``Preprocessor`` specifies how the TF-IDF features are generated.
# In this example, we adjust the ``max_features``, ``min_df``, and ``ngram_range`` of the preprocessor.
# For an explanation of these and other options, refer to the `sklearn page <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html>`_.
# Users can also try other methods to generate features, such as word embeddings.
#
# Finally, we use the generated numerical features to train and evaluate the model.
# The rest of the steps is the same as in the quickstarts.
# Please refer to them for details.
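######################################################################
# For instance, a minimal sketch of that remaining step, mirroring the
# linear quickstart (the LIBLINEAR option string here is illustrative):

model = linear.train_1vsrest(datasets["train"]["y"], datasets["train"]["x"], "-s 2")
preds = linear.predict_values(model, datasets["test"]["x"])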
85 changes: 28 additions & 57 deletions docs/examples/plot_linear_gridsearch_tutorial.py
@@ -1,12 +1,10 @@
"""
-Feature Generation and Parameter Selection for Linear Methods
+Hyperparameter Search for Linear Methods
=============================================================
+This guide helps users to tune the hyperparameters of the feature generation step and the linear model.

-This tutorial demonstrates feature generation and parameter selection for linear methods.
-
-Here we show an example of training a linear text classifier with the rcv1 dataset.
-If you haven't downloaded it yet, see `Data Preparation <../cli/linear.html#step-1-data-preparation>`_.
-Then you can read and preprocess the data as follows
+Here we show an example of tuning a linear text classifier with the `rcv1 dataset <https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multilabel.html#rcv1v2%20(topics;%20full%20sets)>`_.
+We start by loading and preprocessing the data without using ``Preprocessor``:
"""

from sklearn.preprocessing import MultiLabelBinarizer
@@ -17,33 +15,9 @@
y = binarizer.fit_transform(datasets["train"]["y"]).astype("d")

######################################################################
-# We format labels into a 0/1 sparse matrix with ``MultiLabelBinarizer``.
-#
-# Feature Generation
-# ------------------
-# Before training a linear classifier, we must convert each text to a vector of numerical features.
-# To use the default setting (TF-IDF features), check
-# `Linear Model for MultiLabel Classification <../auto_examples/plot_linear_quickstart.html#linear-model-for-multi-label-classification>`_
-# for easily conducting training and testing.
-#
-# If you want to tweak the generation of TF-IDF features, consider
-
-from sklearn.feature_extraction.text import TfidfVectorizer
-
-vectorizer = TfidfVectorizer(max_features=20000, min_df=3)
-x = vectorizer.fit_transform(datasets["train"]["x"])
-model = linear.train_1vsrest(y, x, "-s 2 -m 4")
-
-#######################################################################
-# We use the generated numerical features ``x`` as the input of
-# the linear method ``linear.train_1vsrest``.
+# We then format labels into a 0/1 sparse matrix with ``MultiLabelBinarizer``.
#
-# An Alternative Way for Using a Linear Method
-# --------------------------------------------
-# Besides the default way shown in `Feature Generation <#feature-generation>`_,
-# we can construct a sklearn estimator for training and prediction.
-# This way is used namely for parameter selection described later,
-# as the estimator makes LibMultiLabel methods in a sklearn Pipeline for a grid search.
+# Next, we construct a ``Pipeline`` object that will be used for hyperparameter search later.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
@@ -56,53 +30,50 @@
)

######################################################################
-# For the estimator ``MultiLabelEstimator``, arguments ``options`` is a LIBLINEAR option
+# The vectorizer ``TfidfVectorizer`` is used in ``Pipeline`` to generate TF-IDF features from raw texts.
+# As for the estimator ``MultiLabelEstimator``, argument ``options`` is a LIBLINEAR option
# (see *train Usage* in `liblinear <https://github.com/cjlin1/liblinear>`__ README), and
-# ``linear_technique`` is one of linear techniques: ``1vsrest``, ``thresholding``, ``cost_sensitive``,
-# ``cost_sensitive_micro``, and ``binary_and_mulitclass``.
-# In ``pipeline``, we specify settings used by the estimator.
+# ``linear_technique`` is one of the linear techniques, including ``1vsrest``, ``thresholding``, ``cost_sensitive``,
+# ``cost_sensitive_micro``, and ``binary_and_multiclass``.
#
+# We can specify the aliases of the components used by the pipeline.
+# For example, ``tfidf`` is the alias of ``TfidfVectorizer`` and ``clf`` is the alias of the estimator.
+#
+# We can then use the following code for training.
+pipeline.fit(datasets["train"]["x"], y)
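######################################################################
# For reference, the construction of ``pipeline`` elided above likely
# resembles the following sketch; the aliases come from the text, while the
# exact estimator arguments are assumptions.

pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=20000, min_df=3)),
    ("clf", linear.MultiLabelEstimator(options="-s 2 -m 4", linear_technique="1vsrest")),
])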

######################################################################
# Grid Search over Feature Generations and LIBLINEAR Options
# -----------------------------------------------------------
-# To search for the best setting, we can employ ``GridSearchCV``.
+# To search for the best setting, we employ ``GridSearchCV``.
# The usage is similar to sklearn's except that the parameter ``scoring`` is not available. Please specify
# ``scoring_metric`` in ``linear.MultiLabelEstimator`` instead.
-liblinear_options = ["-s 2 -c 0.5", "-s 2 -c 1", "-s 2 -c 2"]

+liblinear_options = ["-s 2 -c 0.5", "-s 2 -c 1", "-s 2 -c 2", "-s 1 -c 0.5", "-s 1 -c 1", "-s 1 -c 2"]
parameters = {"clf__options": liblinear_options, "tfidf__max_features": [10000, 20000, 40000], "tfidf__min_df": [3, 5]}
clf = linear.GridSearchCV(pipeline, parameters, cv=5, n_jobs=4, verbose=1)
clf = clf.fit(datasets["train"]["x"], y)

######################################################################
-# Here we check the combinations of six feature generations and three regularization parameters
+# Here we check the combinations of six feature generation options and six liblinear options
# in the linear classifier. The key in ``parameters`` should follow sklearn's coding rule,
# starting with the estimator's alias and two underscores (i.e., ``clf__``).
# We specify ``n_jobs=4`` to run four tasks in parallel.
-# After finishing gridsearch, we can get the best parameters by the following code:
+# After finishing the grid search, we can get the best parameters with the following code:

for param_name in sorted(parameters.keys()):
    print(f"{param_name}: {clf.best_params_[param_name]}")

######################################################################
# The best parameters are::
#
-#    clf__options: '-s 2 -c 0.5 -m 1'
-#    tfidf__max_features: 20000
-#    tfidf__min_df: 3
+#    clf__options: -s 2 -c 0.5 -m 1
+#    tfidf__max_features: 10000
+#    tfidf__min_df: 5
#
-# For testing, we also need to read in data first and format test labels into a 0/1 sparse matrix.
-
-y = binarizer.transform(datasets["test"]["y"]).astype("d").toarray()

######################################################################
-# Applying the ``predict`` function of ``GridSearchCV`` object to use the
-# estimator trained under the best hyper-parameters for prediction.
+# Note that in the above code, the ``refit`` argument of ``GridSearchCV`` is enabled by default, meaning that the best configuration will be trained on the whole dataset after the hyperparameter search.
+# We refer to this as the retrain strategy.
+# After fitting ``GridSearchCV``, the retrained model is stored in ``clf``.
+#
+# We can apply the ``predict`` function of the ``GridSearchCV`` object to use the estimator trained under the best hyperparameters for prediction.
+# Then use ``linear.compute_metrics`` to calculate the test performance.

+# For testing, we also need to read in data first and format test labels into a 0/1 sparse matrix.
+y = binarizer.transform(datasets["test"]["y"]).astype("d").toarray()
preds = clf.predict(datasets["test"]["x"])
metrics = linear.compute_metrics(
    preds,
@@ -114,4 +85,4 @@
######################################################################
# The result of the best parameters will look similar to::
#
-#    {'Macro-F1': 0.4965720851051106, 'Micro-F1': 0.8004678830627301, 'P@1': 0.9587412721675744, 'P@3': 0.8021469454453142, 'P@5': 0.5605401496291271}
+#    {'Macro-F1': 0.5296621774388927, 'Micro-F1': 0.8021279986938116, 'P@1': 0.9561621216872636, 'P@3': 0.7983185389507189, 'P@5': 0.5570921518306848}
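######################################################################
# The tail of the ``compute_metrics`` call is cut off above; a hedged
# sketch of the full call (the metric names mirror the printed result, and
# the ``monitor_metrics`` keyword is our assumption about the API):

metrics = linear.compute_metrics(
    preds,
    y,  # the 0/1 test-label matrix built above
    monitor_metrics=["Macro-F1", "Micro-F1", "P@1", "P@3", "P@5"],
)
print(metrics)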
24 changes: 16 additions & 8 deletions docs/examples/plot_linear_tree_tutorial.py
@@ -1,14 +1,25 @@
"""
-Handling Data with Many Labels
-==============================
+Handling Data with Many Labels Using Linear Methods
+====================================================

For the case where the number of labels is very large,
the training time of the standard ``train_1vsrest`` method may be unpleasantly long.
The ``train_tree`` method in LibMultiLabel can vastly improve the training time on such datasets.

-To illustrate this speedup, we will use the `EUR-Lex dataset <https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multilabel.html#EUR-Lex>`_,
-which contains 3,956 labels.
-In this example, the data is downloaded under the directory ``data/eur-lex``.
+To illustrate this speedup, we will use the `EUR-Lex dataset <https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multilabel.html#EUR-Lex>`_, which contains 3,956 labels.
+The data in the following example is downloaded under the directory ``data/eur-lex``.

Users can use the following command to easily apply the ``train_tree`` method.

.. code-block:: bash

    $ python3 main.py --training_file data/eur-lex/train.txt \
                      --test_file data/eur-lex/test.txt \
                      --linear \
                      --linear_technique tree

Besides CLI usage, users can also use the API to apply the ``train_tree`` method.
Below is an example.
"""

import math
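######################################################################
# A minimal sketch of the API usage described above (the ``data/eur-lex``
# file layout follows the text; default ``train_tree`` options are assumed):

from libmultilabel import linear

datasets = linear.load_dataset("txt", "data/eur-lex/train.txt", "data/eur-lex/test.txt")
preprocessor = linear.Preprocessor()
datasets = preprocessor.fit_transform(datasets)
tree_model = linear.train_tree(datasets["train"]["y"], datasets["train"]["x"])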
@@ -88,6 +99,3 @@ def metrics_in_batches(model):
print("Score of 1vsrest:", metrics_in_batches(ovr_model))
print("Score of tree:", metrics_in_batches(tree_model))

-######################################################################
-#
-# .. bibliography::