Skip to content

Commit

Permalink
Stats saving path option for filter op (#309)
Browse files Browse the repository at this point in the history
* add stats saving path option for filter op

* modify test assertion functions
  • Loading branch information
garyzhang99 authored May 7, 2024
1 parent a714f64 commit 453db43
Show file tree
Hide file tree
Showing 5 changed files with 25 additions and 0 deletions.
3 changes: 3 additions & 0 deletions data_juicer/core/executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,9 @@ def run(self, load_data_np=None):
desc=op_name + '_compute_stats')
if self.cfg.use_checkpoint:
prev = dataset
if op.stats_export_path is not None:
self.exporter.export_compute_stats(
dataset, op.stats_export_path)
tmp = dataset.filter(op.process,
num_proc=self.cfg.np,
desc=op_name + '_process')
Expand Down
12 changes: 12 additions & 0 deletions data_juicer/core/exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,18 @@ def export(self, dataset):
self._export_impl(dataset, self.export_path, self.suffix,
self.export_stats)

def export_compute_stats(self, dataset, export_path):
"""
Export method for saving compute status in filters
"""
keep_stats_in_res_ds = self.keep_stats_in_res_ds
self.keep_stats_in_res_ds = True
self._export_impl(dataset,
export_path,
self.suffix,
export_stats=False)
self.keep_stats_in_res_ds = keep_stats_in_res_ds

@staticmethod
def to_jsonl(dataset, export_path, num_proc=1, **kwargs):
"""
Expand Down
3 changes: 3 additions & 0 deletions data_juicer/core/ray_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,9 @@ def process_batch_arrow(table: pa.Table) -> pa.Table:
else:
dataset = dataset.map(op.compute_stats,
num_gpus=num_gpus)
if op.stats_export_path is not None:
dataset.write_json(op.stats_export_path,
force_ascii=False)
dataset = dataset.filter(op.process)
else:
logger.error(
Expand Down
1 change: 1 addition & 0 deletions data_juicer/ops/base_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,7 @@ def __init__(self, *args, **kwargs):

from data_juicer.core.data import wrap_func_with_nested_access
self.compute_stats = wrap_func_with_nested_access(self.compute_stats)
self.stats_export_path = kwargs.get('stats_export_path', None)

def compute_stats(self, sample, context=False):
"""
Expand Down
6 changes: 6 additions & 0 deletions tests/config/test_config_funcs.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ def test_yaml_cfg_file(self):
'video_key': 'videos',
'accelerator': 'cpu',
'spec_numprocs': 0,
'stats_export_path': None,
'cpu_required': 1,
'mem_required': 0,
'use_actor': False,
Expand Down Expand Up @@ -130,6 +131,7 @@ def test_mixture_cfg(self):
'video_key': 'videos',
'accelerator': 'cpu',
'spec_numprocs': 0,
'stats_export_path': None,
'cpu_required': 1,
'mem_required': 0,
'use_actor': False,
Expand All @@ -146,6 +148,7 @@ def test_mixture_cfg(self):
'video_key': 'videos',
'accelerator': 'cpu',
'spec_numprocs': 0,
'stats_export_path': None,
'cpu_required': 1,
'mem_required': 0,
'use_actor': False,
Expand All @@ -162,6 +165,7 @@ def test_mixture_cfg(self):
'video_key': 'videos',
'accelerator': 'cpu',
'spec_numprocs': 0,
'stats_export_path': None,
'cpu_required': 1,
'mem_required': 0,
'use_actor': False,
Expand All @@ -178,6 +182,7 @@ def test_mixture_cfg(self):
'video_key': 'videos',
'accelerator': 'cpu',
'spec_numprocs': 0,
'stats_export_path': None,
'cpu_required': 1,
'mem_required': 0,
'use_actor': False,
Expand All @@ -194,6 +199,7 @@ def test_mixture_cfg(self):
'video_key': 'videos',
'accelerator': 'cpu',
'spec_numprocs': 0,
'stats_export_path': None,
'cpu_required': 1,
'mem_required': 0,
'use_actor': False,
Expand Down

0 comments on commit 453db43

Please sign in to comment.