From 9f098bd24c38d36c719c3f264edd23b8e24617c4 Mon Sep 17 00:00:00 2001
From: Haibin <1400012807@pku.edu.cn>
Date: Fri, 20 Dec 2024 12:03:37 +0800
Subject: [PATCH] doc done
---
configs/config_all.yaml | 18 ++++++++++++++++++
.../ops/aggregator/meta_tags_aggregator.py | 2 +-
.../ops/grouper/naive_reverse_grouper.py | 2 +-
data_juicer/utils/auto_install_mapping.py | 1 +
docs/Operators.md | 12 ++++++++----
docs/Operators_ZH.md | 11 +++++++----
6 files changed, 36 insertions(+), 10 deletions(-)
diff --git a/configs/config_all.yaml b/configs/config_all.yaml
index 462478c23..42d1e779e 100644
--- a/configs/config_all.yaml
+++ b/configs/config_all.yaml
@@ -792,6 +792,9 @@ process:
upper_percentile: # the upper bound of the percentile to be sampled
lower_rank: # the lower rank of the percentile to be sampled
upper_rank: # the upper rank of the percentile to be sampled
+ - tags_specified_field_selector: # Selector to select samples based on the tags of specified field.
+ field_key: '__dj__meta__.query_sentiment_label' # the target keys corresponding to multi-level field information need to be separated by '.'
+ target_tags: ['happy', 'sad'] # Target tags to be selected.
- topk_specified_field_selector: # selector to select top samples based on the sorted specified field
field_key: '' # the target keys corresponding to multi-level field information need to be separated by '.'
top_ratio: # ratio of selected top samples
@@ -800,6 +803,7 @@ process:
# Grouper ops.
- naive_grouper: # Group all samples to one batched sample.
+ - naive_reverse_grouper: # Split batched samples to samples.
- key_value_grouper: # Group samples to batched samples according values in given keys.
group_by_keys: null # Group samples according values in the keys. Support for nested keys such as "__dj__stats__.text_len". It is [self.text_key] in default.
@@ -821,6 +825,20 @@ process:
try_num: 3 # The number of retry attempts when there is an API call error or output parsing error.
model_params: {} # Parameters for initializing the API model.
sampling_params: {} # Extra parameters passed to the API call. e.g {'temperature': 0.9, 'top_p': 0.95}
+ - meta_tags_aggregator: # Merge similar meta tags to one tag.
+ api_model: 'gpt-4o' # API model name.
+ meta_tag_key: '__dj__meta__.query_sentiment_label' # The key of the meta tag to be mapped.
+ target_tags: ['开心', '难过', '其他'] # The tags that are supposed to be mapped to.
+ api_endpoint: null # URL endpoint for the API.
+ response_path: null # Path to extract content from the API response. Defaults to 'choices.0.message.content'.
+ system_prompt: null # The system prompt.
+ input_template: null # The input template.
+ target_tag_template: null # The tag template for target tags.
+ tag_template: null # The tag template for each tag and its frequency.
+ output_pattern: null # The output pattern.
+ try_num: 3 # The number of retry attempts when there is an API call error or output parsing error.
+ model_params: {} # Parameters for initializing the API model.
+ sampling_params: {} # Extra parameters passed to the API call. e.g {'temperature': 0.9, 'top_p': 0.95}
- most_relavant_entities_aggregator: # Extract entities closely related to a given entity from some texts, and sort them in descending order of importance.
api_model: 'gpt-4o' # API model name.
entity: '孙悟空' # The given entity.
diff --git a/data_juicer/ops/aggregator/meta_tags_aggregator.py b/data_juicer/ops/aggregator/meta_tags_aggregator.py
index a60d9096a..808ef73da 100644
--- a/data_juicer/ops/aggregator/meta_tags_aggregator.py
+++ b/data_juicer/ops/aggregator/meta_tags_aggregator.py
@@ -16,7 +16,7 @@
@OPERATORS.register_module(OP_NAME)
class MetaTagsAggregator(Aggregator):
"""
- Merge similar meta tags to one tags.
+ Merge similar meta tags to one tag.
"""
DEFAULT_SYSTEM_PROMPT = ('给定一些标签以及这些标签出现的频次,合并意思相近的标签。\n'
diff --git a/data_juicer/ops/grouper/naive_reverse_grouper.py b/data_juicer/ops/grouper/naive_reverse_grouper.py
index 385a83821..2535205b9 100644
--- a/data_juicer/ops/grouper/naive_reverse_grouper.py
+++ b/data_juicer/ops/grouper/naive_reverse_grouper.py
@@ -3,7 +3,7 @@
@OPERATORS.register_module('naive_reverse_grouper')
class NaiveReverseGrouper(Grouper):
- """Split one batched sample to samples. """
+ """Split batched samples to samples. """
def __init__(self, *args, **kwargs):
"""
diff --git a/data_juicer/utils/auto_install_mapping.py b/data_juicer/utils/auto_install_mapping.py
index 2da2e3616..3b8ec20aa 100644
--- a/data_juicer/utils/auto_install_mapping.py
+++ b/data_juicer/utils/auto_install_mapping.py
@@ -103,4 +103,5 @@
'query_intent_detection_mapper': ['transformers'],
'query_sentiment_detection_mapper': ['transformers'],
'query_topic_detection_mapper': ['transformers'],
+ 'meta_tags_aggregator': ['openai'],
}
diff --git a/docs/Operators.md b/docs/Operators.md
index bd099fb07..963155333 100644
--- a/docs/Operators.md
+++ b/docs/Operators.md
@@ -14,9 +14,9 @@ The operators in Data-Juicer are categorized into 5 types.
| [ Mapper ]( #mapper ) | 70 | Edits and transforms samples |
| [ Filter ]( #filter ) | 44 | Filters out low-quality samples |
| [ Deduplicator ]( #deduplicator ) | 8 | Detects and removes duplicate samples |
-| [ Selector ]( #selector ) | 4 | Selects top samples based on ranking |
-| [ Grouper ]( #grouper ) | 2 | Group samples to batched samples |
-| [ Aggregator ]( #aggregator ) | 3 | Aggregate for batched samples, such as summary or conclusion |
+| [ Selector ]( #selector ) | 5 | Selects top samples based on ranking |
+| [ Grouper ]( #grouper ) | 3 | Group samples to batched samples |
+| [ Aggregator ]( #aggregator ) | 4 | Aggregate for batched samples, such as summary or conclusion |
All the specific operators are listed below, each featured with several capability tags.
@@ -199,20 +199,23 @@ All the specific operators are listed below, each featured with several capabili
| frequency_specified_field_selector | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | Selects top samples by comparing the frequency of the specified field | [code](../data_juicer/ops/selector/frequency_specified_field_selector.py) | [tests](../tests/ops/selector/test_frequency_specified_field_selector.py) |
| random_selector | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | Selects samples randomly | [code](../data_juicer/ops/selector/random_selector.py) | [tests](../tests/ops/selector/test_random_selector.py) |
| range_specified_field_selector | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | Selects samples within a specified range by comparing the values of the specified field | [code](../data_juicer/ops/selector/range_specified_field_selector.py) | [tests](../tests/ops/selector/test_range_specified_field_selector.py) |
+| tags_specified_field_selector | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | Select samples based on the tags of specified field. | [code](../data_juicer/ops/selector/tags_specified_field_selector.py) | [tests](../tests/ops/selector/test_tags_specified_field_selector.py) |
| topk_specified_field_selector | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | Selects top samples by comparing the values of the specified field | [code](../data_juicer/ops/selector/topk_specified_field_selector.py) | [tests](../tests/ops/selector/test_topk_specified_field_selector.py) |
## Grouper
| Operator | Tags | Description | Source code | Unit tests |
|------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------|-------------------------------------------------------------------------------|---------------------------------------------------------------------------|
-| key_value_grouper | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | Group samples to batched samples according values in given keys. | [code](../data_juicer/ops/grouper/key_value_grouper.py) | [tests](../tests/ops/grouper/test_key_value_grouper.py) |
| naive_grouper | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | Group all samples to one batched sample. | [code](../data_juicer/ops/grouper/naive_grouper.py) | [tests](../tests/ops/grouper/test_naive_grouper.py) |
+| naive_reverse_grouper | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | Split batched samples to samples. | [code](../data_juicer/ops/grouper/naive_reverse_grouper.py) | [tests](../tests/ops/grouper/test_naive_reverse_grouper.py) |
+| key_value_grouper | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | Group samples to batched samples according to values in given keys. | [code](../data_juicer/ops/grouper/key_value_grouper.py) | [tests](../tests/ops/grouper/test_key_value_grouper.py) |
## Aggregator
| Operator | Tags | Description | Source code | Unit tests |
|------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------|-------------------------------------------------------------------------------|---------------------------------------------------------------------------|
| entity_attribute_aggregator | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | Return conclusion of the given entity's attribute from some docs. | [code](../data_juicer/ops/aggregator/entity_attribute_aggregator.py) | [tests](../tests/ops/aggregator/test_entity_attribute_aggregator.py) |
+| meta_tags_aggregator | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | Merge similar meta tags to one tag. | [code](../data_juicer/ops/aggregator/meta_tags_aggregator.py) | [tests](../tests/ops/aggregator/test_meta_tags_aggregator.py) |
| most_relavant_entities_aggregator | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | Extract entities closely related to a given entity from some texts, and sort them in descending order of importance. | [code](../data_juicer/ops/aggregator/most_relavant_entities_aggregator.py) | [tests](../tests/ops/aggregator/test_most_relavant_entities_aggregator.py) |
| nested_aggregator | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | Considering the limitation of input length, nested aggregate contents for each given number of samples. | [code](../data_juicer/ops/aggregator/nested_aggregator.py) | [tests](../tests/ops/aggregator/test_nested_aggregator.py) |
diff --git a/docs/Operators_ZH.md b/docs/Operators_ZH.md
index 749459ad7..6c22ac0fe 100644
--- a/docs/Operators_ZH.md
+++ b/docs/Operators_ZH.md
@@ -14,9 +14,9 @@ Data-Juicer 中的算子分为以下 5 种类型。
| [ Mapper ]( #mapper ) | 70 | 对数据样本进行编辑和转换 |
| [ Filter ]( #filter ) | 44 | 过滤低质量样本 |
| [ Deduplicator ]( #deduplicator ) | 8 | 识别、删除重复样本 |
-| [ Selector ]( #selector ) | 4 | 基于排序选取高质量样本 |
-| [ Grouper ]( #grouper ) | 2 | 将样本分组,每一组组成一个批量样本 |
-| [ Aggregator ]( #aggregator ) | 3 | 对批量样本进行汇总,如得出总结或结论 |
+| [ Selector ]( #selector ) | 5 | 基于排序选取高质量样本 |
+| [ Grouper ]( #grouper ) | 3 | 将样本分组,每一组组成一个批量样本 |
+| [ Aggregator ]( #aggregator ) | 4 | 对批量样本进行汇总,如得出总结或结论 |
下面列出所有具体算子,每种算子都通过多个标签来注明其主要功能。
@@ -198,20 +198,23 @@ Data-Juicer 中的算子分为以下 5 种类型。
| frequency_specified_field_selector | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 通过比较指定字段的频率选出前 k 个样本 | [code](../data_juicer/ops/selector/frequency_specified_field_selector.py) | [tests](../tests/ops/selector/test_frequency_specified_field_selector.py) |
| random_selector | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 随机筛选 k 个样本 | [code](../data_juicer/ops/selector/random_selector.py) | [tests](../tests/ops/selector/test_random_selector.py) |
| range_specified_field_selector | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 通过比较指定字段的值选出指定范围的 k 个样本 | [code](../data_juicer/ops/selector/range_specified_field_selector.py) | [tests](../tests/ops/selector/test_range_specified_field_selector.py) |
+| tags_specified_field_selector | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 通过指定字段的标签值筛选样例 | [code](../data_juicer/ops/selector/tags_specified_field_selector.py) | [tests](../tests/ops/selector/test_tags_specified_field_selector.py) |
| topk_specified_field_selector | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 通过比较指定字段的值选出前 k 个样本 | [code](../data_juicer/ops/selector/topk_specified_field_selector.py) | [tests](../tests/ops/selector/test_topk_specified_field_selector.py) |
## Grouper
| 算子 | 标签 | 描述 | 源码 | 单测样例 |
|-------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------|---------------------------------------------------------------------------|---------------------------------------------------------------------------|
+| naive_grouper | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 将所有样本分为一个组,返回一个batch化的样本 | [code](../data_juicer/ops/grouper/naive_grouper.py) | [tests](../tests/ops/grouper/test_naive_grouper.py) |
+| naive_reverse_grouper | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 将batch化的样本拆分成普通的样本 | [code](../data_juicer/ops/grouper/naive_reverse_grouper.py) | [tests](../tests/ops/grouper/test_naive_reverse_grouper.py) |
| key_value_grouper | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 根据给定键的值将样本分组,每一组组成一个批量样本。 | [code](../data_juicer/ops/grouper/key_value_grouper.py) | [tests](../tests/ops/grouper/test_key_value_grouper.py) |
-| naive_grouper | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 将所有样本分为一个组,返回一个批量样本 | [code](../data_juicer/ops/grouper/naive_grouper.py) | [tests](../tests/ops/grouper/test_naive_grouper.py) |
## Aggregator
| 算子 | 标签 | 描述 | 源码 | 单测样例 |
|-------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------|---------------------------------------------------------------------------|---------------------------------------------------------------------------|
| entity_attribute_aggregator | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 从一些文本中总结出给定实体的属性 | [code](../data_juicer/ops/aggregator/entity_attribute_aggregator.py) | [tests](../tests/ops/aggregator/test_entity_attribute_aggregator.py) |
+| meta_tags_aggregator | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 将相似的标签合并成同一个标签。 | [code](../data_juicer/ops/aggregator/meta_tags_aggregator.py) | [tests](../tests/ops/aggregator/test_meta_tags_aggregator.py) |
| most_relavant_entities_aggregator | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 从一些文本中抽取出与给定实体密切相关的实体,按重要性从高到低排序 | [code](../data_juicer/ops/aggregator/most_relavant_entities_aggregator.py) | [tests](../tests/ops/aggregator/test_most_relavant_entities_aggregator.py) |
| nested_aggregator | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 考虑到输入长度的限制,对样本中的内容进行嵌套聚合。 | [code](../data_juicer/ops/aggregator/nested_aggregator.py) | [tests](../tests/ops/aggregator/test_nested_aggregator.py) |