# File: 1_single_op_pipline.yaml
# Sandbox config example

# Global parameters
project_name: 'demo-bench'
# Used as the wandb tracer name.
experiment_name: 'single_op_language_score'
# Default output directory for meta logging.
work_dir: './outputs/demo-bench'
# Configs for each job. The jobs are executed in the order in which they
# appear in each list.
probe_job_configs:
  # Compute the statistics value of every sample and analyze the distribution
  # at the given percentiles.
  - hook: 'ProbeViaAnalyzerHook'
    meta_name: 'analysis_ori_data'
    dj_configs:
      project_name: 'demo-bench'
      # Path to your dataset directory or file.
      dataset_path: './demos/data/demo-dataset-videos.jsonl'
      # Percentiles at which the dataset distribution is analyzed.
      percentiles: [0.333, 0.667]
      export_path: './outputs/demo-bench/demo-dataset-with-language-score.jsonl'
      # Must be true to keep the statistics values with the dataset.
      export_original_dataset: true
      process:
        - language_id_score_filter:
            lang: 'zh'
            min_score: 0.8
    # No extra configs are given for this hook.
    extra_configs: null
# No jobs are listed under this key in this example.
refine_recipe_job_configs: null
execution_job_configs:
  # Select the split with high statistics values: the samples whose language
  # score lies between the 0.667 and 1.000 percentiles.
  - hook: 'ProcessDataHook'
    meta_name: null
    dj_configs:
      project_name: 'demo-bench'
      # Output dataset of the probe job.
      dataset_path: './outputs/demo-bench/demo-dataset-with-language-score.jsonl'
      export_path: './outputs/demo-bench/demo-dataset-with-high-language-score.jsonl'
      process:
        - range_specified_field_selector:
            # Keys of a multi-level field are separated by '.'.
            # '__dj__stats__' is the default location where Data Juicer
            # stores stats, and 'lang_score' is the stat corresponding to
            # language_id_score_filter.
            field_key: '__dj__stats__.lang_score'
            lower_percentile: 0.667
            upper_percentile: 1.000
    extra_configs: null
  # Randomly sample a fixed number of instances from the dataset.
  - hook: 'ProcessDataHook'
    meta_name: null
    dj_configs:
      project_name: 'demo-bench'
      # Output dataset of the previous (high-language-score selection) job.
      dataset_path: './outputs/demo-bench/demo-dataset-with-high-language-score.jsonl'
      export_path: './outputs/demo-bench/demo-dataset-for-train.jsonl'
      process:
        - random_selector:
            select_num: 16
    extra_configs: null
  # Train the model.
  - hook: 'TrainModelHook'
    meta_name: null
    dj_configs: null
    extra_configs: './configs/demo/bench/model_train.yaml'
  # Run inference with the model.
  - hook: 'InferModelHook'
    meta_name: null
    dj_configs: null
    extra_configs: './configs/demo/bench/model_infer.yaml'
evaluation_job_configs:
  # VBench evaluation; its settings are read from the extra_configs file.
  - hook: 'EvaluateDataHook'
    meta_name: 'vbench_eval'
    dj_configs: null
    extra_configs: './configs/demo/bench/vbench_eval.yaml'