Merge branch 'main' of github.com:alibaba/data-juicer into debug/gen_qa_from_example
BeachWang committed Dec 23, 2024
2 parents be9fc91 + a26dcc7 commit 5d1b6bd
Showing 100 changed files with 4,051 additions and 614 deletions.
25 changes: 24 additions & 1 deletion README.md
@@ -197,6 +197,22 @@ The dependency options are listed below:
| `.[tools]` | Install dependencies for dedicated tools, such as quality classifiers. |
| `.[sandbox]` | Install all dependencies for sandbox. |

- Install dependencies for specific OPs

As the number of OPs grows, the dependencies of all OPs become very heavy. Instead of installing all dependencies with the command `pip install -v -e .[sci]`,
we provide two lighter alternatives:

- Automatic Minimal Dependency Installation: During the execution of Data-Juicer, minimal dependencies will be installed automatically. This allows for immediate execution, but may lead to dependency conflicts.

- Manual Minimal Dependency Installation: To manually install minimal dependencies tailored to a specific execution configuration, run the following command:
```shell
# only for installation from source
python tools/dj_install.py --config path_to_your_data-juicer_config_file

# use command line tool
dj-install --config path_to_your_data-juicer_config_file
```

### Using pip

- Run the following command to install the latest released `data_juicer` using `pip`:
@@ -317,9 +333,16 @@ python tools/analyze_data.py --config configs/demo/analyzer.yaml
# use command line tool
dj-analyze --config configs/demo/analyzer.yaml
# you can also use auto mode to avoid writing a recipe. It will analyze a small
# part (e.g. 1000 samples, specified by argument `auto_num`) of your dataset
# with all Filters that produce stats.
dj-analyze --auto --dataset_path xx.jsonl [--auto_num 1000]
```
- **Note:** The Analyzer only computes stats for Filter ops, so extra Mapper or Deduplicator ops will be ignored in the analysis process.
- **Note:** The Analyzer only computes stats for Filters that produce stats and for other OPs that produce tags/categories in the meta field, so all other OPs are ignored in the analysis process. We use the following registries to decorate OPs (see the sketch after this list):
- `NON_STATS_FILTERS`: decorates Filters that **DO NOT** produce any stats.
- `TAGGING_OPS`: decorates OPs that **DO** produce tags/categories in the meta field.
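
As a minimal sketch, decorating an OP might look like the following, assuming these registries expose the same `register_module` decorator as the existing `OPERATORS` registry; the import path and OP names here are illustrative assumptions, not part of this change:
```python
# Illustrative sketch only: the registry import path and the OP names are
# assumptions; the decorator pattern mirrors the existing OPERATORS registry.
from data_juicer.ops.base_op import (NON_STATS_FILTERS, OPERATORS,
                                     TAGGING_OPS, Filter, Mapper)


@NON_STATS_FILTERS.register_module('my_rule_based_filter')
@OPERATORS.register_module('my_rule_based_filter')
class MyRuleBasedFilter(Filter):
    """A Filter that keeps or drops samples without producing any stats,
    so the Analyzer skips it."""


@TAGGING_OPS.register_module('my_topic_tagging_mapper')
@OPERATORS.register_module('my_topic_tagging_mapper')
class MyTopicTaggingMapper(Mapper):
    """A Mapper that writes topic tags into each sample's meta field,
    so the Analyzer can include those tags in its analysis."""
```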
### Data Visualization
23 changes: 22 additions & 1 deletion README_ZH.md
@@ -178,6 +178,21 @@ pip install -v -e .[tools] # install dependencies for dedicated tool libraries
| `.[tools]` | Install dependencies for dedicated tools, such as quality classifiers. |
| `.[sandbox]` | Install basic dependencies for the sandbox. |

* Install dependencies for specific OPs only

As the number of OPs grows, the dependencies of all OPs become very heavy. Instead of installing all dependencies with the command `pip install -v -e .[sci]`, we provide two lighter alternatives:

* Automatic minimal dependency installation: minimal dependencies are installed automatically while Data-Juicer runs, so you can start execution right away, but this may lead to dependency conflicts.

* Manual minimal dependency installation: run the following command to manually install the minimal dependencies for a specific execution config:
```shell
# only for installation from source
python tools/dj_install.py --config path_to_your_data-juicer_config_file

# use the command line tool
dj-install --config path_to_your_data-juicer_config_file
```

### Using pip

* Run the following command to install the latest released `data_juicer` using `pip`:
@@ -295,9 +310,15 @@ python tools/analyze_data.py --config configs/demo/analyzer.yaml
# use the command line tool
dj-analyze --config configs/demo/analyzer.yaml
# you can also use auto mode to avoid writing a recipe. It will analyze a small
# part of your dataset (e.g. 1000 samples, specified by the `auto_num` argument)
# with all Filters that produce stats.
dj-analyze --auto --dataset_path xx.jsonl [--auto_num 1000]
```

* **Note**: The Analyzer only computes stats for Filter ops; other ops (e.g. Mappers and Deduplicators) are ignored during the analysis process.
* **Note**: The Analyzer only works on Filter ops that produce stats in the stats field and on other ops that produce tags/categories in the meta field; all other ops are ignored during the analysis process. We use the following two registries to decorate the relevant ops:
* `NON_STATS_FILTERS`: decorates Filter ops that **DO NOT** produce any stats.
* `TAGGING_OPS`: decorates ops that produce tags/categories in the meta field.

### 数据可视化

90 changes: 87 additions & 3 deletions configs/config_all.yaml

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion data_juicer/__init__.py
@@ -1,4 +1,4 @@
__version__ = '1.0.1'
__version__ = '1.0.2'

import os
import subprocess
93 changes: 67 additions & 26 deletions data_juicer/analysis/column_wise_analysis.py
Expand Up @@ -4,8 +4,9 @@
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
from wordcloud import WordCloud

from data_juicer.utils.constant import Fields
from data_juicer.utils.constant import DEFAULT_PREFIX, Fields

from .overall_analysis import OverallAnalysis

Expand Down Expand Up @@ -69,6 +70,12 @@ def __init__(self,
stats into one image file
"""
self.stats = pd.DataFrame(dataset[Fields.stats])
self.meta = pd.DataFrame(dataset[Fields.meta])
# remove non-tag columns
meta_columns = self.meta.columns
for col_name in meta_columns:
if not col_name.startswith(DEFAULT_PREFIX):
self.meta = self.meta.drop(col_name, axis=1)
self.output_path = output_path
if not os.path.exists(self.output_path):
os.makedirs(self.output_path)
@@ -100,8 +107,9 @@ def analyze(self, show_percentiles=False, show=False, skip_export=False):
width_unit = 4
height_unit = 6

columns = self.stats.columns
num = len(columns)
stats_and_meta = pd.concat([self.stats, self.meta], axis=1)
all_columns = stats_and_meta.columns
num = len(all_columns)

# get the recommended "best" number of columns and rows
rec_row, rec_col, grid_indexes = get_row_col(num, num_subcol)
@@ -114,9 +122,9 @@ def analyze(self, show_percentiles=False, show=False, skip_export=False):
fig = plt.figure(figsize=(rec_width, rec_height),
layout='constrained')
subfigs = fig.subfigures(rec_row, rec_col, wspace=0.01)
for i, column_name in enumerate(tqdm(columns.to_list(),
desc='Column')):
data = self.stats[column_name]
for i, column_name in enumerate(
tqdm(all_columns.to_list(), desc='Column')):
data = stats_and_meta[column_name]
# explode data to flatten inner list
data = data.explode().infer_objects()
grid = grid_indexes[i]
@@ -145,33 +153,39 @@ def analyze(self, show_percentiles=False, show=False, skip_export=False):
else:
axes = [None] * num_subcol

# draw histogram
self.draw_hist(axes[0],
data,
os.path.join(self.output_path,
f'{column_name}-hist.png'),
percentiles=percentiles)

# draw box
self.draw_box(axes[1],
data,
os.path.join(self.output_path,
f'{column_name}-box.png'),
percentiles=percentiles)
if not skip_export:
# draw histogram
self.draw_hist(axes[0],
data,
os.path.join(self.output_path,
f'{column_name}-hist.png'),
percentiles=percentiles)

# draw box
self.draw_box(axes[1],
data,
os.path.join(self.output_path,
f'{column_name}-box.png'),
percentiles=percentiles)
else:
# object (string) or string list -- draw a histogram and a word cloud
# for this column
if self.save_stats_in_one_file:
axes = subfig.subplots(1, 1)
axes = subfig.subplots(1, num_subcol)
else:
axes = None
axes = [None] * num_subcol

if not skip_export:
self.draw_hist(
axes, data,
axes[0], data,
os.path.join(self.output_path,
f'{column_name}-hist.png'))

self.draw_wordcloud(
axes[1], data,
os.path.join(self.output_path,
f'{column_name}-wordcloud.png'))

# add a title to the figure of this stat
if self.save_stats_in_one_file:
subfig.suptitle(f'{data.name}',
@@ -203,10 +217,7 @@ def draw_hist(self, ax, data, save_path, percentiles=None, show=False):
"""
# recommended number of bins
data_num = len(data)
if data_num >= 100:
rec_bins = int(math.sqrt(len(data)))
else:
rec_bins = None
rec_bins = max(int(math.sqrt(data_num)), 10)

# if ax is None, using plot method in pandas
if ax is None:
@@ -297,3 +308,33 @@ def draw_box(self, ax, data, save_path, percentiles=None, show=False):
# accumulated overlapped figures in different draw_xxx function
# calling
ax.clear()

def draw_wordcloud(self, ax, data, save_path, show=False):
word_list = data.tolist()
word_nums = {}
for w in word_list:
if w in word_nums:
word_nums[w] += 1
else:
word_nums[w] = 1

wc = WordCloud(width=400, height=320)
wc.generate_from_frequencies(word_nums)

if ax is None:
ax = plt.figure(figsize=(20, 16))
else:
ax.imshow(wc, interpolation='bilinear')
ax.axis('off')

if not self.save_stats_in_one_file:
# save into file
wc.to_file(save_path)

if show:
plt.show()
else:
# if no showing, we need to clear this axes to avoid
# accumulated overlapped figures in different draw_xxx function
# calling
ax.clear()
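
For context, a hedged usage sketch of the updated `ColumnWiseAnalysis`: the constructor arguments beyond `dataset` and `output_path`, and the exact field layout, are assumptions inferred from this diff rather than a verified API.
```python
# Hedged sketch: numeric stats columns get histogram/box plots, while
# string-typed stats/meta columns get histogram + word-cloud plots.
from datasets import Dataset

from data_juicer.analysis.column_wise_analysis import ColumnWiseAnalysis
from data_juicer.utils.constant import DEFAULT_PREFIX, Fields

# a toy dataset that already carries Analyzer-style stats and meta fields
ds = Dataset.from_dict({
    'text': ['a short sample', 'another sample', 'one more sample'],
    Fields.stats: [{'text_len': 14}, {'text_len': 14}, {'text_len': 15}],
    Fields.meta: [{f'{DEFAULT_PREFIX}lang': 'en'},
                  {f'{DEFAULT_PREFIX}lang': 'en'},
                  {f'{DEFAULT_PREFIX}lang': 'zh'}],
})

analysis = ColumnWiseAnalysis(ds, output_path='./analysis_figs')
analysis.analyze(show_percentiles=True)
```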
111 changes: 111 additions & 0 deletions data_juicer/analysis/measure.py
@@ -1,9 +1,13 @@
import numpy as np

from data_juicer.utils.lazy_loader import LazyLoader

torch = LazyLoader('torch', 'torch')
td = LazyLoader('td', 'torch.distributions')
F = LazyLoader('F', 'torch.nn.functional')

stats = LazyLoader('stats', 'scipy.stats')


class Measure(object):
"""Base class for Measure distribution.
Expand Down Expand Up @@ -48,6 +52,15 @@ def _convert_to_categorical(self, p):
else:
return td.Categorical(torch.tensor(p))

def _convert_to_ndarray(self, p):
"""
Convert input data to a numpy ndarray.
:param p: input data, now support
[`scalar`,`list`, `tuple`, `torch binary file`, and `Categorical`].
:return: torch tensor
"""
return self._convert_to_tensor(p).numpy()


class KLDivMeasure(Measure):
"""
@@ -108,3 +121,101 @@ class EntropyMeasure(Measure):
def measure(self, p):
p = self._convert_to_categorical(p)
return p.entropy()


class RelatedTTestMeasure(Measure):
"""
Measure the paired t-test for two related distributions on their
histograms over the same bins.
Ref:
https://en.wikipedia.org/wiki/Student%27s_t-test
For continuous features or distributions, the input can be a list of
dataset stats.
For discrete features or distributions, the input can be a list of tags
or categories.
"""
name = 't-test'

@staticmethod
def stats_to_hist(p, q):
p = np.array(p)
q = np.array(q)

# get common maximum number of data samples, and max/min values
max_data_num = max(len(p), len(q))
min_val = min(min(p), min(q))
max_val = max(max(p), max(q))

# get a recommended number of bins
rec_bins = max(int(np.sqrt(max_data_num)), 10)

# get the common bin edges
common_p = np.append(p, [min_val, max_val])
hist_p, bin_edges = np.histogram(common_p, bins=rec_bins)
# restore the hist of the original p
hist_p[0] -= 1
hist_p[-1] -= 1
# get the hist of the original q using the common bin edges
hist_q, _ = np.histogram(q, bins=bin_edges)
return hist_p, hist_q, bin_edges

@staticmethod
def category_to_hist(p, q):

def flatten_list(lst):
res = []
for s in lst:
if isinstance(s, list):
res.extend(flatten_list(s))
else:
res.append(s)
return res

# flatten the list
p = flatten_list(p)
q = flatten_list(q)

# get the union of categories from both inputs
cat_p = set(p)
cat_q = set(q)
cat_common = cat_p.union(cat_q)

# get category distributions
count_p = {cat: 0 for cat in cat_common}
count_q = {cat: 0 for cat in cat_common}
for cat in p:
count_p[cat] += 1
for cat in q:
count_q[cat] += 1

# only keep distribution values sorted by counts
sorted_cat = list(count_p.items())
sorted_cat.sort(key=lambda it: it[1], reverse=True)
sorted_cat = [it[0] for it in sorted_cat]
# get the value dist
hist_p = [count_p[cat] for cat in sorted_cat]
hist_q = [count_q[cat] for cat in sorted_cat]

return hist_p, hist_q, count_p, count_q, sorted_cat

def measure(self, p, q):
"""
:param p: the first feature or distribution. (stats/tags/categories)
:param q: the second feature or distribution. (stats/tags/categories)
:return: the T-Test results object -- ([ref](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats._result_classes.TtestResult.html#scipy.stats._result_classes.TtestResult)) # noqa: E501
"""
ele = p[0]
while isinstance(ele, list):
ele = ele[0]
if isinstance(ele, str):
# discrete tags or categories
hist_p, hist_q = self.category_to_hist(p, q)[:2]
else:
# continuous stats
hist_p, hist_q = self.stats_to_hist(p, q)[:2]

# compute the t-test and pval for hist_p and hist_q
ttest_res = stats.ttest_rel(hist_p, hist_q)
return ttest_res
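
A hedged usage sketch of `RelatedTTestMeasure` as defined above; the import path follows the file location in this diff, and the numbers and tags are made up for illustration.
```python
# Usage sketch for RelatedTTestMeasure; all input data below is illustrative.
from data_juicer.analysis.measure import RelatedTTestMeasure

ttest = RelatedTTestMeasure()

# continuous case: two related lists of per-sample stats
stats_before = [0.12, 0.34, 0.56, 0.78, 0.91, 0.45, 0.33, 0.27]
stats_after = [0.15, 0.30, 0.60, 0.70, 0.88, 0.50, 0.35, 0.25]
res = ttest.measure(stats_before, stats_after)
print(res.statistic, res.pvalue)

# discrete case: two related lists of tags/categories
# (nested lists are flattened internally)
tags_before = [['en'], ['en', 'zh'], ['fr'], ['en']]
tags_after = [['en'], ['zh'], ['fr', 'en'], ['zh']]
res = ttest.measure(tags_before, tags_after)
print(res.statistic, res.pvalue)
```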