# configs/demo/bench/1_single_op_pipline.yaml
#
# Sandbox config example.

# Global parameters.
project_name: 'demo-bench'
# Experiment name; used as the wandb tracer name.
experiment_name: 'single_op_language_score'
# Default output directory for meta logging.
work_dir: './outputs/demo-bench'

# Per-stage job lists. The jobs in each list are executed in the order in
# which they appear.
probe_job_configs:
  # Compute the statistics value for each sample and analyze the distribution
  # at the given percentiles.
  - hook: 'ProbeViaAnalyzerHook'
    meta_name: 'analysis_ori_data'
    dj_configs:
      project_name: 'demo-bench'
      # Path to the dataset directory or file.
      dataset_path: './demos/data/demo-dataset-videos.jsonl'
      # Percentiles at which the dataset distribution is analyzed.
      percentiles: [0.333, 0.667]
      export_path: './outputs/demo-bench/demo-dataset-with-language-score.jsonl'
      # Must be true so that the statistics values are kept with the dataset.
      export_original_dataset: true
      process:
        - language_id_score_filter:
            lang: 'zh'
            min_score: 0.8
    extra_configs: null

# No refine-recipe jobs are configured in this example.
refine_recipe_job_configs: null

execution_job_configs:
  # Select the high-score split: the samples whose language score lies
  # between the 0.667 and 1.0 percentiles.
  - hook: 'ProcessDataHook'
    meta_name: null
    dj_configs:
      project_name: 'demo-bench'
      # Input: the dataset exported by the probe job.
      dataset_path: './outputs/demo-bench/demo-dataset-with-language-score.jsonl'
      export_path: './outputs/demo-bench/demo-dataset-with-high-language-score.jsonl'
      process:
        - range_specified_field_selector:
            # Keys of multi-level fields are written with '.' between the
            # levels. '__dj__stats__' is the default location for storing
            # stats in Data Juicer, and 'lang_score' is the stat that
            # corresponds to language_id_score_filter.
            field_key: '__dj__stats__.lang_score'
            lower_percentile: 0.667
            upper_percentile: 1.000
    extra_configs: null
  # Randomly sample a fixed number of instances from the dataset.
  - hook: 'ProcessDataHook'
    meta_name: null
    dj_configs:
      project_name: 'demo-bench'
      # Input: the high-score split exported by the previous job.
      dataset_path: './outputs/demo-bench/demo-dataset-with-high-language-score.jsonl'
      export_path: './outputs/demo-bench/demo-dataset-for-train.jsonl'
      process:
        - random_selector:
            select_num: 16
    extra_configs: null
  # Train the model.
  - hook: 'TrainModelHook'
    meta_name: null
    dj_configs: null
    extra_configs: './configs/demo/bench/model_train.yaml'
  # Run inference with the model.
  - hook: 'InferModelHook'
    meta_name: null
    dj_configs: null
    extra_configs: './configs/demo/bench/model_infer.yaml'

evaluation_job_configs:
  # VBench evaluation.
  - hook: 'EvaluateDataHook'
    meta_name: 'vbench_eval'
    dj_configs: null
    extra_configs: './configs/demo/bench/vbench_eval.yaml'