From 075eb52c48cf17edbb718134b1bbd31a43af2aa6 Mon Sep 17 00:00:00 2001 From: yxdyc Date: Thu, 26 Dec 2024 06:57:40 +0000 Subject: [PATCH] deploy: 7d5f37d6f7d5c41d135c7ff28ca5330c85cbbfec --- .buildinfo | 4 +- _modules/data_juicer.html | 22 +- _modules/data_juicer/analysis/collector.html | 188 + .../analysis/column_wise_analysis.html | 39 +- .../analysis/diversity_analysis.html | 45 +- _modules/data_juicer/analysis/draw.html | 154 + _modules/data_juicer/analysis/measure.html | 372 ++ .../analysis/overall_analysis.html | 30 +- _modules/data_juicer/config/config.html | 77 +- _modules/data_juicer/core/adapter.html | 40 +- _modules/data_juicer/core/analyzer.html | 28 +- _modules/data_juicer/core/data.html | 99 +- _modules/data_juicer/core/executor.html | 30 +- _modules/data_juicer/core/exporter.html | 36 +- _modules/data_juicer/core/monitor.html | 43 +- _modules/data_juicer/core/ray_data.html | 411 ++ _modules/data_juicer/core/ray_executor.html | 210 + _modules/data_juicer/core/tracer.html | 34 +- .../data_juicer/format/csv_formatter.html | 26 +- .../data_juicer/format/empty_formatter.html | 34 +- _modules/data_juicer/format/formatter.html | 57 +- .../data_juicer/format/json_formatter.html | 26 +- _modules/data_juicer/format/load.html | 24 +- .../data_juicer/format/mixture_formatter.html | 30 +- .../data_juicer/format/parquet_formatter.html | 26 +- .../data_juicer/format/text_formatter.html | 38 +- .../data_juicer/format/tsv_formatter.html | 26 +- .../entity_attribute_aggregator.html | 32 +- .../most_relavant_entities_aggregator.html | 32 +- .../ops/aggregator/nested_aggregator.html | 32 +- _modules/data_juicer/ops/base_op.html | 153 +- .../data_juicer/ops/common/helper_func.html | 58 +- .../deduplicator/document_deduplicator.html | 30 +- .../document_minhash_deduplicator.html | 40 +- .../document_simhash_deduplicator.html | 30 +- .../ops/deduplicator/image_deduplicator.html | 35 +- .../deduplicator/ray_basic_deduplicator.html | 32 +- .../ray_document_deduplicator.html | 28 +- .../deduplicator/ray_image_deduplicator.html | 33 +- .../deduplicator/ray_video_deduplicator.html | 28 +- .../ops/deduplicator/video_deduplicator.html | 30 +- .../ops/filter/alphanumeric_filter.html | 30 +- .../ops/filter/audio_duration_filter.html | 30 +- .../ops/filter/audio_nmf_snr_filter.html | 40 +- .../ops/filter/audio_size_filter.html | 30 +- .../filter/average_line_length_filter.html | 30 +- .../filter/character_repetition_filter.html | 30 +- .../ops/filter/flagged_words_filter.html | 30 +- .../ops/filter/image_aesthetics_filter.html | 30 +- .../ops/filter/image_aspect_ratio_filter.html | 30 +- .../ops/filter/image_face_count_filter.html | 30 +- .../ops/filter/image_face_ratio_filter.html | 30 +- .../ops/filter/image_nsfw_filter.html | 30 +- .../filter/image_pair_similarity_filter.html | 30 +- .../ops/filter/image_shape_filter.html | 30 +- .../ops/filter/image_size_filter.html | 30 +- .../filter/image_text_matching_filter.html | 30 +- .../filter/image_text_similarity_filter.html | 30 +- .../ops/filter/image_watermark_filter.html | 30 +- .../ops/filter/language_id_score_filter.html | 30 +- .../filter/maximum_line_length_filter.html | 30 +- .../ops/filter/perplexity_filter.html | 30 +- .../phrase_grounding_recall_filter.html | 45 +- .../ops/filter/special_characters_filter.html | 30 +- .../ops/filter/specified_field_filter.html | 30 +- .../specified_numeric_field_filter.html | 35 +- .../ops/filter/stopwords_filter.html | 30 +- .../data_juicer/ops/filter/suffix_filter.html | 30 +- .../ops/filter/text_action_filter.html | 30 +- .../filter/text_entity_dependency_filter.html | 30 +- .../ops/filter/text_length_filter.html | 30 +- .../ops/filter/token_num_filter.html | 30 +- .../ops/filter/video_aesthetics_filter.html | 30 +- .../ops/filter/video_aspect_ratio_filter.html | 30 +- .../ops/filter/video_duration_filter.html | 30 +- .../video_frames_text_similarity_filter.html | 30 +- .../ops/filter/video_motion_score_filter.html | 39 +- .../video_motion_score_raft_filter.html | 30 +- .../ops/filter/video_nsfw_filter.html | 30 +- .../filter/video_ocr_area_ratio_filter.html | 37 +- .../ops/filter/video_resolution_filter.html | 30 +- .../video_tagging_from_frames_filter.html | 30 +- .../ops/filter/video_watermark_filter.html | 30 +- .../ops/filter/word_repetition_filter.html | 30 +- .../ops/filter/words_num_filter.html | 30 +- .../ops/grouper/key_value_grouper.html | 28 +- .../ops/grouper/naive_grouper.html | 28 +- _modules/data_juicer/ops/load.html | 24 +- .../mapper/audio_ffmpeg_wrapped_mapper.html | 28 +- .../ops/mapper/calibrate_qa_mapper.html | 32 +- .../ops/mapper/calibrate_query_mapper.html | 26 +- .../ops/mapper/calibrate_response_mapper.html | 26 +- .../ops/mapper/chinese_convert_mapper.html | 33 +- .../ops/mapper/clean_copyright_mapper.html | 28 +- .../ops/mapper/clean_email_mapper.html | 28 +- .../ops/mapper/clean_html_mapper.html | 28 +- .../ops/mapper/clean_ip_mapper.html | 28 +- .../ops/mapper/clean_links_mapper.html | 28 +- .../ops/mapper/expand_macro_mapper.html | 28 +- .../extract_entity_attribute_mapper.html | 30 +- .../extract_entity_relation_mapper.html | 34 +- .../ops/mapper/extract_event_mapper.html | 30 +- .../ops/mapper/extract_keyword_mapper.html | 30 +- .../ops/mapper/extract_nickname_mapper.html | 30 +- .../mapper/extract_support_text_mapper.html | 28 +- .../ops/mapper/fix_unicode_mapper.html | 28 +- .../generate_qa_from_examples_mapper.html | 32 +- .../mapper/generate_qa_from_text_mapper.html | 30 +- .../ops/mapper/image_blur_mapper.html | 28 +- .../image_captioning_from_gpt4v_mapper.html | 33 +- .../ops/mapper/image_captioning_mapper.html | 28 +- .../ops/mapper/image_diffusion_mapper.html | 28 +- .../ops/mapper/image_face_blur_mapper.html | 28 +- .../ops/mapper/image_tagging_mapper.html | 28 +- .../ops/mapper/nlpaug_en_mapper.html | 28 +- .../ops/mapper/nlpcda_zh_mapper.html | 28 +- .../ops/mapper/optimize_qa_mapper.html | 32 +- .../ops/mapper/optimize_query_mapper.html | 26 +- .../ops/mapper/optimize_response_mapper.html | 26 +- .../ops/mapper/pair_preference_mapper.html | 32 +- .../punctuation_normalization_mapper.html | 28 +- .../ops/mapper/python_file_mapper.html | 30 +- .../ops/mapper/python_lambda_mapper.html | 30 +- .../ops/mapper/relation_identity_mapper.html | 30 +- .../mapper/remove_bibliography_mapper.html | 28 +- .../ops/mapper/remove_comments_mapper.html | 28 +- .../ops/mapper/remove_header_mapper.html | 28 +- .../ops/mapper/remove_long_words_mapper.html | 30 +- .../remove_non_chinese_character_mapper.html | 28 +- .../remove_repeat_sentences_mapper.html | 33 +- .../mapper/remove_specific_chars_mapper.html | 28 +- .../ops/mapper/remove_table_text_mapper.html | 28 +- ...ords_with_incorrect_substrings_mapper.html | 30 +- .../ops/mapper/replace_content_mapper.html | 28 +- .../ops/mapper/sentence_split_mapper.html | 28 +- .../ops/mapper/text_chunk_mapper.html | 32 +- .../video_captioning_from_audio_mapper.html | 28 +- .../video_captioning_from_frames_mapper.html | 28 +- ...deo_captioning_from_summarizer_mapper.html | 28 +- .../video_captioning_from_video_mapper.html | 28 +- .../mapper/video_extract_frames_mapper.html | 28 +- .../ops/mapper/video_face_blur_mapper.html | 28 +- .../mapper/video_ffmpeg_wrapped_mapper.html | 28 +- .../mapper/video_remove_watermark_mapper.html | 28 +- .../video_resize_aspect_ratio_mapper.html | 33 +- .../video_resize_resolution_mapper.html | 28 +- .../video_split_by_duration_mapper.html | 35 +- .../video_split_by_key_frame_mapper.html | 35 +- .../mapper/video_split_by_scene_mapper.html | 33 +- .../video_tagging_from_audio_mapper.html | 28 +- .../video_tagging_from_frames_mapper.html | 28 +- .../whitespace_normalization_mapper.html | 28 +- _modules/data_juicer/ops/op_fusion.html | 332 ++ .../frequency_specified_field_selector.html | 28 +- .../ops/selector/random_selector.html | 28 +- .../range_specified_field_selector.html | 28 +- .../topk_specified_field_selector.html | 28 +- _modules/data_juicer/utils/asset_utils.html | 170 + .../data_juicer/utils/auto_install_utils.html | 221 + _modules/data_juicer/utils/cache_utils.html | 188 + _modules/data_juicer/utils/ckpt_utils.html | 270 + _modules/data_juicer/utils/common_utils.html | 277 + _modules/data_juicer/utils/compress.html | 690 +++ _modules/data_juicer/utils/constant.html | 399 ++ _modules/data_juicer/utils/file_utils.html | 345 ++ .../data_juicer/utils/fingerprint_utils.html | 281 + _modules/data_juicer/utils/lazy_loader.html | 184 + _modules/data_juicer/utils/logger_utils.html | 307 ++ _modules/data_juicer/utils/mm_utils.html | 1165 ++++ _modules/data_juicer/utils/model_utils.html | 982 ++++ _modules/data_juicer/utils/process_utils.html | 237 + _modules/data_juicer/utils/registry.html | 258 + .../data_juicer/utils/resource_utils.html | 173 + .../data_juicer/utils/unittest_utils.html | 263 + _modules/index.html | 47 +- _sources/data_juicer.analysis.rst.txt | 60 +- _sources/data_juicer.config.rst.txt | 20 +- _sources/data_juicer.core.rst.txt | 84 +- _sources/data_juicer.format.rst.txt | 84 +- _sources/data_juicer.ops.aggregator.rst.txt | 36 +- _sources/data_juicer.ops.common.rst.txt | 28 +- _sources/data_juicer.ops.deduplicator.rst.txt | 84 +- _sources/data_juicer.ops.filter.rst.txt | 364 +- _sources/data_juicer.ops.grouper.rst.txt | 28 +- _sources/data_juicer.ops.mapper.rst.txt | 524 +- _sources/data_juicer.ops.rst.txt | 50 +- _sources/data_juicer.ops.selector.rst.txt | 44 +- _sources/data_juicer.rst.txt | 23 +- _sources/data_juicer.tools.rst.txt | 9 +- _sources/data_juicer.utils.rst.txt | 164 +- _static/basic.css | 15 +- _static/doctools.js | 7 + _static/language_data.js | 7 + _static/searchtools.js | 38 +- data_juicer.analysis.html | 523 +- data_juicer.config.html | 216 +- data_juicer.core.html | 1012 +++- data_juicer.format.html | 616 ++- data_juicer.html | 1208 +++- data_juicer.ops.aggregator.html | 330 +- data_juicer.ops.common.html | 247 +- data_juicer.ops.deduplicator.html | 609 +- data_juicer.ops.filter.html | 3044 +++++++++- data_juicer.ops.grouper.html | 139 +- data_juicer.ops.html | 2675 ++++++++- data_juicer.ops.mapper.html | 3666 +++++++++++- data_juicer.ops.selector.html | 280 +- data_juicer.tools.html | 33 +- data_juicer.utils.html | 2331 +++++++- genindex.html | 4891 +++++++++++++++-- index.html | 381 +- modules.html | 116 +- objects.inv | Bin 7512 -> 16752 bytes py-modindex.html | 907 ++- search.html | 22 +- searchindex.js | 2 +- 216 files changed, 34296 insertions(+), 3579 deletions(-) create mode 100644 _modules/data_juicer/analysis/collector.html create mode 100644 _modules/data_juicer/analysis/draw.html create mode 100644 _modules/data_juicer/analysis/measure.html create mode 100644 _modules/data_juicer/core/ray_data.html create mode 100644 _modules/data_juicer/core/ray_executor.html create mode 100644 _modules/data_juicer/ops/op_fusion.html create mode 100644 _modules/data_juicer/utils/asset_utils.html create mode 100644 _modules/data_juicer/utils/auto_install_utils.html create mode 100644 _modules/data_juicer/utils/cache_utils.html create mode 100644 _modules/data_juicer/utils/ckpt_utils.html create mode 100644 _modules/data_juicer/utils/common_utils.html create mode 100644 _modules/data_juicer/utils/compress.html create mode 100644 _modules/data_juicer/utils/constant.html create mode 100644 _modules/data_juicer/utils/file_utils.html create mode 100644 _modules/data_juicer/utils/fingerprint_utils.html create mode 100644 _modules/data_juicer/utils/lazy_loader.html create mode 100644 _modules/data_juicer/utils/logger_utils.html create mode 100644 _modules/data_juicer/utils/mm_utils.html create mode 100644 _modules/data_juicer/utils/model_utils.html create mode 100644 _modules/data_juicer/utils/process_utils.html create mode 100644 _modules/data_juicer/utils/registry.html create mode 100644 _modules/data_juicer/utils/resource_utils.html create mode 100644 _modules/data_juicer/utils/unittest_utils.html diff --git a/.buildinfo b/.buildinfo index 7a3a9d48e..3940ce529 100644 --- a/.buildinfo +++ b/.buildinfo @@ -1,4 +1,4 @@ # Sphinx build info version 1 -# This file records the configuration used when building these files. When it is not found, a full rebuild will be done. -config: d21389c0a148f57cab87e3135f4aa3e2 +# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. +config: 02acd820f6eb43d6f533ae13ad9142b0 tags: 645f666f9bcd5a90fca523b33c5a78b7 diff --git a/_modules/data_juicer.html b/_modules/data_juicer.html index 1fc60dc18..9ddcf12c4 100644 --- a/_modules/data_juicer.html +++ b/_modules/data_juicer.html @@ -11,7 +11,7 @@ - + @@ -39,16 +39,16 @@ diff --git a/_modules/data_juicer/analysis/collector.html b/_modules/data_juicer/analysis/collector.html new file mode 100644 index 000000000..f05c53043 --- /dev/null +++ b/_modules/data_juicer/analysis/collector.html @@ -0,0 +1,188 @@ + + + + + + + + data_juicer.analysis.collector — data_juicer 1.0.2 documentation + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for data_juicer.analysis.collector

+from itertools import chain
+
+from data_juicer.format import load_formatter
+from data_juicer.utils.lazy_loader import LazyLoader
+
+torch = LazyLoader('torch', 'torch')
+transformers = LazyLoader('transformers', 'transformers')
+
+
+
+[docs] +class TextTokenDistCollector(object): + """Tokenize and collect distribution of tokens for given + dataset with a specified tokenizer. + """ + +
+[docs] + def __init__(self, tokenizer): + """ + Initialization method. + + :param tokenizer: tokenizer name on huggingface + """ + self.tokenizer = transformers.AutoTokenizer.from_pretrained( + tokenizer, trust_remote_code=True) + self.vocab_size = len(self.tokenizer)
+ + +
+[docs] + def collect(self, + data_path, + text_key, + num_proc=1) -> 'torch.distributions.Categorical': + """ + Tokenize and collect tokens distribution of input dataset + :param data_path: path to input dataset. + :param text_key: field keys that will be considered into token counts. + :param num_proc: number of processes to count tokens. + :return: token distribution. + """ + + formatter = load_formatter(data_path) + dataset = formatter.load_dataset(num_proc=num_proc) + assert text_key in dataset.features, f'[{text_key} not find in dataset' + + def prepare_tokenizer( + tokenizer, + text_key, + ): + """ + Prepare a tokenizer function for dataset. + :param tokenizer: a tokenizer to tokenize sample. + :param text_key: field keys that will be + considered into token counts. + """ + + def _tokenize_fn(example, ): + example = tokenizer(example[text_key], + add_special_tokens=False) + return example + + return _tokenize_fn + + tokenize_proc = prepare_tokenizer(self.tokenizer, text_key) + dataset = dataset.map(tokenize_proc, + num_proc=num_proc, + desc=f'tokenize {data_path.split("/")[-1]}') + + token_count = torch.zeros(self.vocab_size, dtype=torch.int64) + token_ids = torch.tensor( + list(chain.from_iterable(dataset['input_ids']))) + indices, counts = token_ids.unique(return_counts=True) + token_count.scatter_(0, indices, counts.to(token_count.dtype)) + dist = torch.distributions.Categorical(token_count) + return dist
+
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2024, Data-Juicer Team.

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/_modules/data_juicer/analysis/column_wise_analysis.html b/_modules/data_juicer/analysis/column_wise_analysis.html index 00680847c..88f2ba97f 100644 --- a/_modules/data_juicer/analysis/column_wise_analysis.html +++ b/_modules/data_juicer/analysis/column_wise_analysis.html @@ -11,7 +11,7 @@ - + @@ -39,16 +39,16 @@ @@ -90,6 +90,8 @@

Source code for data_juicer.analysis.column_wise_analysis

from .overall_analysis import OverallAnalysis +
+[docs] def get_row_col(total_num, factor=2): """ Given the total number of stats figures, get the "best" number of rows and @@ -128,16 +130,17 @@

Source code for data_juicer.analysis.column_wise_analysis

for i in range(total_num): grids.append((i // now_col, i % now_col)) - return int(now_row), int(now_col), grids + return int(now_row), int(now_col), grids
+
-[docs] +[docs] class ColumnWiseAnalysis: """Apply analysis on each column of stats respectively."""
-[docs] +[docs] def __init__(self, dataset, output_path, @@ -173,7 +176,7 @@

Source code for data_juicer.analysis.column_wise_analysis

-[docs] +[docs] def analyze(self, show_percentiles=False, show=False, skip_export=False): """ Apply analysis and draw the analysis figure for stats. @@ -291,7 +294,7 @@

Source code for data_juicer.analysis.column_wise_analysis

-[docs] +[docs] def draw_hist(self, ax, data, save_path, percentiles=None, show=False): """ Draw the histogram for the data. @@ -352,7 +355,7 @@

Source code for data_juicer.analysis.column_wise_analysis

-[docs] +[docs] def draw_box(self, ax, data, save_path, percentiles=None, show=False): """ Draw the box plot for the data. @@ -403,7 +406,7 @@

Source code for data_juicer.analysis.column_wise_analysis

-[docs] +[docs] def draw_wordcloud(self, ax, data, save_path, show=False): word_list = data.tolist() word_nums = {} diff --git a/_modules/data_juicer/analysis/diversity_analysis.html b/_modules/data_juicer/analysis/diversity_analysis.html index 6556ae401..104fa13dd 100644 --- a/_modules/data_juicer/analysis/diversity_analysis.html +++ b/_modules/data_juicer/analysis/diversity_analysis.html @@ -11,7 +11,7 @@ - + @@ -39,16 +39,16 @@
@@ -88,6 +88,8 @@

Source code for data_juicer.analysis.diversity_analysis

# Modify from self_instruct, please refer to # https://github.com/yizhongw/self-instruct/blob/main/self_instruct/instruction_visualize.ipynb +
+[docs] def find_root_verb_and_its_dobj(tree_root): """ Find the verb and its object closest to the root. @@ -108,11 +110,14 @@

Source code for data_juicer.analysis.diversity_analysis

for child in tree_root.children: return find_root_verb_and_its_dobj(child) # if no children satisfy the condition, return None - return None, None + return None, None
+ # Modify from self_instruct, please refer to # https://github.com/yizhongw/self-instruct/blob/main/self_instruct/instruction_visualize.ipynb +
+[docs] def find_root_verb_and_its_dobj_in_string(nlp, s, first_sent=True): """ Find the verb and its object closest to the root of lexical tree of input @@ -131,9 +136,12 @@

Source code for data_juicer.analysis.diversity_analysis

verb, noun = find_root_verb_and_its_dobj(sent.root) if first_sent or (verb is not None and noun is not None): return verb, noun - return None, None + return None, None
+ +
+[docs] def get_diversity(dataset, top_k_verbs=20, top_k_nouns=4, **kwargs): """ Given the lexical tree analysis result, return the diversity results. @@ -158,17 +166,18 @@

Source code for data_juicer.analysis.diversity_analysis

df = df.groupby('verb').apply(lambda x: x.sort_values( 'count', ascending=False).head(top_k_nouns)).reset_index(drop=True) - return df + return df
+
-[docs] +[docs] class DiversityAnalysis: """Apply diversity analysis for each sample and get an overall analysis result."""
-[docs] +[docs] def __init__(self, dataset, output_path, lang_or_model='en'): """Initialization method :param dataset: the dataset to be analyzed :param output_path: path to store the analysis results :param @@ -183,7 +192,7 @@

Source code for data_juicer.analysis.diversity_analysis

-[docs] +[docs] def compute(self, lang_or_model=None, column_name='text'): """ Apply lexical tree analysis on each sample. @@ -217,7 +226,7 @@

Source code for data_juicer.analysis.diversity_analysis

-[docs] +[docs] def analyze(self, lang_or_model=None, column_name='text', diff --git a/_modules/data_juicer/analysis/draw.html b/_modules/data_juicer/analysis/draw.html new file mode 100644 index 000000000..5a7cc7a3e --- /dev/null +++ b/_modules/data_juicer/analysis/draw.html @@ -0,0 +1,154 @@ + + + + + + + + data_juicer.analysis.draw — data_juicer 1.0.2 documentation + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for data_juicer.analysis.draw

+import matplotlib.pyplot as plt
+import numpy as np
+import seaborn as sns
+
+
+
+[docs] +def draw_heatmap(data, xlabels, ylables=None, figsize=None, triangle=False): + """ + Draw heatmap of input data with special lables. + + :param data: input data, now support + [`list`, `tuple`, `numpy array`, 'torch tensor'] + :param xlabels: x axis labels. + :param ylabels: y axis labels, if None, use xlabels. + :param figsize: figure size. + :param triangle: only display triangle. + :return: a plot figure. + """ + figsize = figsize if figsize else (8 * 2.5, 6 * 2.5) + _, ax = plt.subplots(figsize=figsize) + mask = None + if triangle: + mask = np.triu(np.ones_like(data)) + ax.tick_params( + right=True, + top=True, + labelright=True, + labeltop=True, + ) + sns.heatmap(data, + ax=ax, + cmap='Oranges', + annot=True, + mask=mask, + linewidths=.05, + square=True, + xticklabels=xlabels, + yticklabels=ylables, + annot_kws={'size': 8}) + plt.subplots_adjust(left=.1, right=0.95, bottom=0.22, top=0.95) + fig = plt.gcf() + plt.show() + return fig
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2024, Data-Juicer Team.

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/_modules/data_juicer/analysis/measure.html b/_modules/data_juicer/analysis/measure.html new file mode 100644 index 000000000..61d48df0f --- /dev/null +++ b/_modules/data_juicer/analysis/measure.html @@ -0,0 +1,372 @@ + + + + + + + + data_juicer.analysis.measure — data_juicer 1.0.2 documentation + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for data_juicer.analysis.measure

+import numpy as np
+
+from data_juicer.utils.lazy_loader import LazyLoader
+
+torch = LazyLoader('torch', 'torch')
+td = LazyLoader('td', 'torch.distributions')
+F = LazyLoader('F', 'torch.nn.functional')
+
+stats = LazyLoader('stats', 'scipy.stats')
+
+
+
+[docs] +class Measure(object): + """Base class for Measure distribution. + """ + name = 'base' + +
+[docs] + def measure(self, *args, **kwargs): + pass
+ + + def __call__(self, *args, **kwargs): + return self.measure(*args, **kwargs) + + def _convert_to_tensor(self, p): + """ + Convert input data to torch tensor. + :param p: input data, now support + [`scalar`,`list`, `tuple`, `torch binary file`, and `Categorical`]. + :return: torch tensor + """ + if isinstance(p, torch.Tensor): + return p + elif isinstance(p, td.Categorical): + return p.probs + elif isinstance(p, str): + return torch.load(p) + else: + return torch.tensor(p) + + def _convert_to_categorical(self, p): + """ + Convert input data to torch Categorical. + :param p: input data, now support + [`scalar`,`list`, `tuple`, `torch binary file`, and `Categorical`]. + :return: torch Categorical + """ + if isinstance(p, td.Categorical): + return p + elif isinstance(p, torch.Tensor): + return td.Categorical(p) + elif isinstance(p, str): + return td.Categorical(torch.load(p)) + else: + return td.Categorical(torch.tensor(p)) + + def _convert_to_ndarray(self, p): + """ + Convert input data to torch tensor. + :param p: input data, now support + [`scalar`,`list`, `tuple`, `torch binary file`, and `Categorical`]. + :return: torch tensor + """ + return self._convert_to_tensor(p).numpy()
+ + + +
+[docs] +class KLDivMeasure(Measure): + """ + Measure Kullback-Leibler divergence. + """ + name = 'kl_divergence' + +
+[docs] + def measure(self, p, q): + p = self._convert_to_categorical(p) + q = self._convert_to_categorical(q) + assert p.probs.shape == q.probs.shape, \ + 'The two inputs have different shape:' \ + f'{p.probs.shape} != {q.probs.shape} in {self.name}' + return F.kl_div(q.logits, p.probs, log_target=False, reduction='sum')
+
+ + + +
+[docs] +class JSDivMeasure(Measure): + """ + Measure Jensen-Shannon divergence. + """ + name = 'js_divergence' + +
+[docs] + def measure(self, p, q): + p = self._convert_to_tensor(p) + q = self._convert_to_tensor(q) + assert p.shape == q.shape, \ + 'The two inputs have different shape:' \ + f'{p.shape} != {q.shape} in {self.name}' + + m = 0.5 * (p + q) + kl_p = KLDivMeasure()(p, m) + kl_q = KLDivMeasure()(q, m) + js = 0.5 * (kl_p + kl_q) + return js
+
+ + + +
+[docs] +class CrossEntropyMeasure(Measure): + """ + Measure Cross-Entropy. + """ + name = 'cross_entropy' + +
+[docs] + def measure(self, p, q): + p = self._convert_to_categorical(p) + q = self._convert_to_categorical(q) + assert p.probs.shape == q.probs.shape, \ + 'The two inputs have different shape: '\ + f'{p.probs.shape} != {q.probs.shape} in {self.name}' + return F.cross_entropy(q.logits, p.probs, reduction='sum')
+
+ + + +
+[docs] +class EntropyMeasure(Measure): + """ + Measure Entropy. + """ + name = 'entropy' + +
+[docs] + def measure(self, p): + p = self._convert_to_categorical(p) + return p.entropy()
+
+ + + +
+[docs] +class RelatedTTestMeasure(Measure): + """ + Measure T-Test for two related distributions on their histogram of the same + bins. + + Ref: + https://en.wikipedia.org/wiki/Student%27s_t-test + + For continuous features or distributions, the input could be dataset stats + list. + For discrete features or distributions, the input could be the tags or the + categories list. + """ + name = 't-test' + +
+[docs] + @staticmethod + def stats_to_hist(p, q): + p = np.array(p) + q = np.array(q) + + # get common maximum number of data samples, and max/min values + max_data_num = max(len(p), len(q)) + min_val = min(min(p), min(q)) + max_val = max(max(p), max(q)) + + # get a recommended number of bins + rec_bins = max(int(np.sqrt(max_data_num)), 10) + + # get the common bin edges + common_p = np.append(p, [min_val, max_val]) + hist_p, bin_edges = np.histogram(common_p, bins=rec_bins) + # restore the hist of the original p + hist_p[0] -= 1 + hist_p[-1] -= 1 + # get the hist of the original q using the common bin edges + hist_q, _ = np.histogram(q, bins=bin_edges) + return hist_p, hist_q, bin_edges
+ + +
+[docs] + @staticmethod + def category_to_hist(p, q): + + def flatten_list(lst): + res = [] + for s in lst: + if isinstance(s, list): + res.extend(flatten_list(s)) + else: + res.append(s) + return res + + # flatten the list + p = flatten_list(p) + q = flatten_list(q) + + # get the common categories + cat_p = set(p) + cat_q = set(q) + cat_common = cat_p.union(cat_q) + + # get category distributions + count_p = {cat: 0 for cat in cat_common} + count_q = {cat: 0 for cat in cat_common} + for cat in p: + count_p[cat] += 1 + for cat in q: + count_q[cat] += 1 + + # only keep distribution values sorted by counts + sorted_cat = list(count_p.items()) + sorted_cat.sort(key=lambda it: it[1], reverse=True) + sorted_cat = [it[0] for it in sorted_cat] + # get the value dist + hist_p = [count_p[cat] for cat in sorted_cat] + hist_q = [count_q[cat] for cat in sorted_cat] + + return hist_p, hist_q, count_p, count_q, sorted_cat
+ + +
+[docs] + def measure(self, p, q): + """ + :param p: the first feature or distribution. (stats/tags/categories) + :param q: the second feature or distribution. (stats/tags/categories) + :return: the T-Test results object -- ([ref](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats._result_classes.TtestResult.html#scipy.stats._result_classes.TtestResult)) # noqa: E501 + """ + ele = p[0] + while isinstance(ele, list): + ele = ele[0] + if isinstance(ele, str): + # discrete tags or categories + hist_p, hist_q = self.category_to_hist(p, q)[:2] + else: + # continuous stats + hist_p, hist_q = self.stats_to_hist(p, q)[:2] + + # compute the t-test and pval for hist_p and hist_q + ttest_res = stats.ttest_rel(hist_p, hist_q) + return ttest_res
+
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2024, Data-Juicer Team.

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/_modules/data_juicer/analysis/overall_analysis.html b/_modules/data_juicer/analysis/overall_analysis.html index b50c3cc6b..e86453abe 100644 --- a/_modules/data_juicer/analysis/overall_analysis.html +++ b/_modules/data_juicer/analysis/overall_analysis.html @@ -11,7 +11,7 @@ - + @@ -39,16 +39,16 @@
@@ -93,13 +93,13 @@

Source code for data_juicer.analysis.overall_analysis

-[docs] +[docs] class OverallAnalysis: """Apply analysis on the overall stats, including mean, std, quantiles, etc."""
-[docs] +[docs] def __init__(self, dataset, output_path): """ Initialization method. @@ -129,7 +129,7 @@

Source code for data_juicer.analysis.overall_analysis

-[docs] +[docs] def refine_single_column(self, col): if col.dtype != 'object': # not an object, return directly @@ -152,7 +152,7 @@

Source code for data_juicer.analysis.overall_analysis

-[docs] +[docs] def analyze(self, percentiles=[], num_proc=1, skip_export=False): """ Apply overall analysis on the whole dataset based on the describe diff --git a/_modules/data_juicer/config/config.html b/_modules/data_juicer/config/config.html index 9adf1de29..283b02b33 100644 --- a/_modules/data_juicer/config/config.html +++ b/_modules/data_juicer/config/config.html @@ -11,7 +11,7 @@ - + @@ -39,16 +39,16 @@
@@ -103,7 +103,7 @@

Source code for data_juicer.config.config

 
 
 
-[docs] +[docs] def init_configs(args: Optional[List[str]] = None, which_entry: object = None): """ initialize the jsonargparse parser and parse configs from one of: @@ -481,6 +481,8 @@

Source code for data_juicer.config.config

 
 
 
+