-
Notifications
You must be signed in to change notification settings - Fork 194
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
216 changed files
with
34,296 additions
and
3,579 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
# Sphinx build info version 1 | ||
# This file records the configuration used when building these files. When it is not found, a full rebuild will be done. | ||
config: d21389c0a148f57cab87e3135f4aa3e2 | ||
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. | ||
config: 02acd820f6eb43d6f533ae13ad9142b0 | ||
tags: 645f666f9bcd5a90fca523b33c5a78b7 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,188 @@ | ||
|
||
|
||
<!DOCTYPE html> | ||
<html class="writer-html5" lang="en" data-content_root="../../../"> | ||
<head> | ||
<meta charset="utf-8" /> | ||
<meta name="viewport" content="width=device-width, initial-scale=1.0" /> | ||
<title>data_juicer.analysis.collector — data_juicer 1.0.2 documentation</title> | ||
<link rel="stylesheet" type="text/css" href="../../../_static/pygments.css?v=80d5e7a1" /> | ||
<link rel="stylesheet" type="text/css" href="../../../_static/css/theme.css?v=e59714d7" /> | ||
|
||
|
||
<script src="../../../_static/documentation_options.js?v=1ed6394b"></script> | ||
<script src="../../../_static/doctools.js?v=9a2dae69"></script> | ||
<script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script> | ||
<script src="../../../_static/js/theme.js"></script> | ||
<link rel="index" title="Index" href="../../../genindex.html" /> | ||
<link rel="search" title="Search" href="../../../search.html" /> | ||
</head> | ||
|
||
<body class="wy-body-for-nav"> | ||
<div class="wy-grid-for-nav"> | ||
<nav data-toggle="wy-nav-shift" class="wy-nav-side"> | ||
<div class="wy-side-scroll"> | ||
<div class="wy-side-nav-search" > | ||
|
||
|
||
|
||
<a href="../../../index.html" class="icon icon-home"> | ||
data_juicer | ||
</a> | ||
<div role="search"> | ||
<form id="rtd-search-form" class="wy-form" action="../../../search.html" method="get"> | ||
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" /> | ||
<input type="hidden" name="check_keywords" value="yes" /> | ||
<input type="hidden" name="area" value="default" /> | ||
</form> | ||
</div> | ||
</div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu"> | ||
<p class="caption" role="heading"><span class="caption-text">API Reference</span></p> | ||
<ul> | ||
<li class="toctree-l1"><a class="reference internal" href="../../../data_juicer.core.html">data_juicer.core package</a></li> | ||
<li class="toctree-l1"><a class="reference internal" href="../../../data_juicer.ops.html">data_juicer.ops package</a></li> | ||
<li class="toctree-l1"><a class="reference internal" href="../../../data_juicer.ops.filter.html">data_juicer.ops.filter package</a></li> | ||
<li class="toctree-l1"><a class="reference internal" href="../../../data_juicer.ops.mapper.html">data_juicer.ops.mapper package</a></li> | ||
<li class="toctree-l1"><a class="reference internal" href="../../../data_juicer.ops.deduplicator.html">data_juicer.ops.deduplicator package</a></li> | ||
<li class="toctree-l1"><a class="reference internal" href="../../../data_juicer.ops.selector.html">data_juicer.ops.selector package</a></li> | ||
<li class="toctree-l1"><a class="reference internal" href="../../../data_juicer.ops.common.html">data_juicer.ops.common package</a></li> | ||
<li class="toctree-l1"><a class="reference internal" href="../../../data_juicer.analysis.html">data_juicer.analysis package</a></li> | ||
<li class="toctree-l1"><a class="reference internal" href="../../../data_juicer.config.html">data_juicer.config package</a></li> | ||
<li class="toctree-l1"><a class="reference internal" href="../../../data_juicer.format.html">data_juicer.format package</a></li> | ||
</ul> | ||
|
||
</div> | ||
</div> | ||
</nav> | ||
|
||
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" > | ||
<i data-toggle="wy-nav-top" class="fa fa-bars"></i> | ||
<a href="../../../index.html">data_juicer</a> | ||
</nav> | ||
|
||
<div class="wy-nav-content"> | ||
<div class="rst-content"> | ||
<div role="navigation" aria-label="Page navigation"> | ||
<ul class="wy-breadcrumbs"> | ||
<li><a href="../../../index.html" class="icon icon-home" aria-label="Home"></a></li> | ||
<li class="breadcrumb-item"><a href="../../index.html">Module code</a></li> | ||
<li class="breadcrumb-item"><a href="../../data_juicer.html">data_juicer</a></li> | ||
<li class="breadcrumb-item active">data_juicer.analysis.collector</li> | ||
<li class="wy-breadcrumbs-aside"> | ||
</li> | ||
</ul> | ||
<hr/> | ||
</div> | ||
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article"> | ||
<div itemprop="articleBody"> | ||
|
||
<h1>Source code for data_juicer.analysis.collector</h1><div class="highlight"><pre> | ||
<span></span><span class="kn">from</span> <span class="nn">itertools</span> <span class="kn">import</span> <span class="n">chain</span> | ||
|
||
<span class="kn">from</span> <span class="nn">data_juicer.format</span> <span class="kn">import</span> <span class="n">load_formatter</span> | ||
<span class="kn">from</span> <span class="nn">data_juicer.utils.lazy_loader</span> <span class="kn">import</span> <span class="n">LazyLoader</span> | ||
|
||
<span class="n">torch</span> <span class="o">=</span> <span class="n">LazyLoader</span><span class="p">(</span><span class="s1">'torch'</span><span class="p">,</span> <span class="s1">'torch'</span><span class="p">)</span> | ||
<span class="n">transformers</span> <span class="o">=</span> <span class="n">LazyLoader</span><span class="p">(</span><span class="s1">'transformers'</span><span class="p">,</span> <span class="s1">'transformers'</span><span class="p">)</span> | ||
|
||
|
||
<div class="viewcode-block" id="TextTokenDistCollector"> | ||
<a class="viewcode-back" href="../../../data_juicer.analysis.html#data_juicer.analysis.collector.TextTokenDistCollector">[docs]</a> | ||
<span class="k">class</span> <span class="nc">TextTokenDistCollector</span><span class="p">(</span><span class="nb">object</span><span class="p">):</span> | ||
<span class="w"> </span><span class="sd">"""Tokenize and collect distribution of tokens for given</span> | ||
<span class="sd"> dataset with a specified tokenizer.</span> | ||
<span class="sd"> """</span> | ||
|
||
<div class="viewcode-block" id="TextTokenDistCollector.__init__"> | ||
<a class="viewcode-back" href="../../../data_juicer.analysis.html#data_juicer.analysis.collector.TextTokenDistCollector.__init__">[docs]</a> | ||
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">tokenizer</span><span class="p">):</span> | ||
<span class="w"> </span><span class="sd">"""</span> | ||
<span class="sd"> Initialization method.</span> | ||
|
||
<span class="sd"> :param tokenizer: tokenizer name on huggingface</span> | ||
<span class="sd"> """</span> | ||
<span class="bp">self</span><span class="o">.</span><span class="n">tokenizer</span> <span class="o">=</span> <span class="n">transformers</span><span class="o">.</span><span class="n">AutoTokenizer</span><span class="o">.</span><span class="n">from_pretrained</span><span class="p">(</span> | ||
<span class="n">tokenizer</span><span class="p">,</span> <span class="n">trust_remote_code</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> | ||
<span class="bp">self</span><span class="o">.</span><span class="n">vocab_size</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">tokenizer</span><span class="p">)</span></div> | ||
|
||
|
||
<div class="viewcode-block" id="TextTokenDistCollector.collect"> | ||
<a class="viewcode-back" href="../../../data_juicer.analysis.html#data_juicer.analysis.collector.TextTokenDistCollector.collect">[docs]</a> | ||
<span class="k">def</span> <span class="nf">collect</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> | ||
<span class="n">data_path</span><span class="p">,</span> | ||
<span class="n">text_key</span><span class="p">,</span> | ||
<span class="n">num_proc</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span> <span class="o">-></span> <span class="s1">'torch.distributions.Categorical'</span><span class="p">:</span> | ||
<span class="w"> </span><span class="sd">"""</span> | ||
<span class="sd"> Tokenize and collect tokens distribution of input dataset</span> | ||
<span class="sd"> :param data_path: path to input dataset.</span> | ||
<span class="sd"> :param text_key: field keys that will be considered into token counts.</span> | ||
<span class="sd"> :param num_proc: number of processes to count tokens.</span> | ||
<span class="sd"> :return: token distribution.</span> | ||
<span class="sd"> """</span> | ||
|
||
<span class="n">formatter</span> <span class="o">=</span> <span class="n">load_formatter</span><span class="p">(</span><span class="n">data_path</span><span class="p">)</span> | ||
<span class="n">dataset</span> <span class="o">=</span> <span class="n">formatter</span><span class="o">.</span><span class="n">load_dataset</span><span class="p">(</span><span class="n">num_proc</span><span class="o">=</span><span class="n">num_proc</span><span class="p">)</span> | ||
<span class="k">assert</span> <span class="n">text_key</span> <span class="ow">in</span> <span class="n">dataset</span><span class="o">.</span><span class="n">features</span><span class="p">,</span> <span class="sa">f</span><span class="s1">'[</span><span class="si">{</span><span class="n">text_key</span><span class="si">}</span><span class="s1"> not find in dataset'</span> | ||
|
||
<span class="k">def</span> <span class="nf">prepare_tokenizer</span><span class="p">(</span> | ||
<span class="n">tokenizer</span><span class="p">,</span> | ||
<span class="n">text_key</span><span class="p">,</span> | ||
<span class="p">):</span> | ||
<span class="w"> </span><span class="sd">"""</span> | ||
<span class="sd"> Prepare a tokenizer function for dataset.</span> | ||
<span class="sd"> :param tokenizer: a tokenizer to tokenize sample.</span> | ||
<span class="sd"> :param text_key: field keys that will be</span> | ||
<span class="sd"> considered into token counts.</span> | ||
<span class="sd"> """</span> | ||
|
||
<span class="k">def</span> <span class="nf">_tokenize_fn</span><span class="p">(</span><span class="n">example</span><span class="p">,</span> <span class="p">):</span> | ||
<span class="n">example</span> <span class="o">=</span> <span class="n">tokenizer</span><span class="p">(</span><span class="n">example</span><span class="p">[</span><span class="n">text_key</span><span class="p">],</span> | ||
<span class="n">add_special_tokens</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> | ||
<span class="k">return</span> <span class="n">example</span> | ||
|
||
<span class="k">return</span> <span class="n">_tokenize_fn</span> | ||
|
||
<span class="n">tokenize_proc</span> <span class="o">=</span> <span class="n">prepare_tokenizer</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">tokenizer</span><span class="p">,</span> <span class="n">text_key</span><span class="p">)</span> | ||
<span class="n">dataset</span> <span class="o">=</span> <span class="n">dataset</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="n">tokenize_proc</span><span class="p">,</span> | ||
<span class="n">num_proc</span><span class="o">=</span><span class="n">num_proc</span><span class="p">,</span> | ||
<span class="n">desc</span><span class="o">=</span><span class="sa">f</span><span class="s1">'tokenize </span><span class="si">{</span><span class="n">data_path</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s2">"/"</span><span class="p">)[</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span><span class="si">}</span><span class="s1">'</span><span class="p">)</span> | ||
|
||
<span class="n">token_count</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">vocab_size</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">torch</span><span class="o">.</span><span class="n">int64</span><span class="p">)</span> | ||
<span class="n">token_ids</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">tensor</span><span class="p">(</span> | ||
<span class="nb">list</span><span class="p">(</span><span class="n">chain</span><span class="o">.</span><span class="n">from_iterable</span><span class="p">(</span><span class="n">dataset</span><span class="p">[</span><span class="s1">'input_ids'</span><span class="p">])))</span> | ||
<span class="n">indices</span><span class="p">,</span> <span class="n">counts</span> <span class="o">=</span> <span class="n">token_ids</span><span class="o">.</span><span class="n">unique</span><span class="p">(</span><span class="n">return_counts</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> | ||
<span class="n">token_count</span><span class="o">.</span><span class="n">scatter_</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">indices</span><span class="p">,</span> <span class="n">counts</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">token_count</span><span class="o">.</span><span class="n">dtype</span><span class="p">))</span> | ||
<span class="n">dist</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">distributions</span><span class="o">.</span><span class="n">Categorical</span><span class="p">(</span><span class="n">token_count</span><span class="p">)</span> | ||
<span class="k">return</span> <span class="n">dist</span></div> | ||
</div> | ||
|
||
</pre></div> | ||
|
||
</div> | ||
</div> | ||
<footer> | ||
|
||
<hr/> | ||
|
||
<div role="contentinfo"> | ||
<p>© Copyright 2024, Data-Juicer Team.</p> | ||
</div> | ||
|
||
Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a | ||
<a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a> | ||
provided by <a href="https://readthedocs.org">Read the Docs</a>. | ||
|
||
|
||
</footer> | ||
</div> | ||
</div> | ||
</section> | ||
</div> | ||
<script> | ||
jQuery(function () { | ||
SphinxRtdTheme.Navigation.enable(true); | ||
}); | ||
</script> | ||
|
||
</body> | ||
</html> |
Oops, something went wrong.