-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdvc.yaml
96 lines (96 loc) · 2.66 KB
/
dvc.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# Top-level metrics files for this pipeline.
# NOTE(review): the `evaluate` stage declares ${files.metrics} as an output;
# presumably that resolves to this same path - confirm against params.yaml.
metrics:
  - data/metrics.json
# Pipeline stages. Every ${...} placeholder is a DVC template variable
# (files.*, hp.*, doc-store.*, rag.*, sub-sample, max-length, test-set-size);
# presumably these are defined in params.yaml - confirm.
# Each stage lists its own script under `deps`, so editing a script
# invalidates that stage.
stages:
  # Runs scripts/fetch_eidc_metadata.py; ${files.metadata} is the declared
  # output and ${sub-sample} is passed via -s.
  fetch-metadata:
    cmd: >-
      uv run scripts/fetch_eidc_metadata.py
      ${files.metadata}
      -s ${sub-sample}
    deps:
      - scripts/fetch_eidc_metadata.py
    outs:
      - ${files.metadata}

  # Runs scripts/fetch_supporting_docs.py on ${files.metadata}; declared
  # output is ${files.supporting-docs}.
  fetch-supporting-docs:
    cmd: >-
      uv run scripts/fetch_supporting_docs.py
      ${files.metadata}
      ${files.supporting-docs}
    deps:
      - ${files.metadata}
      - scripts/fetch_supporting_docs.py
    outs:
      - ${files.supporting-docs}

  # Runs scripts/extract_metadata.py on ${files.metadata}; declared output is
  # ${files.extracted}.
  extract-metadata:
    cmd: >-
      uv run scripts/extract_metadata.py
      ${files.metadata}
      ${files.extracted}
    deps:
      - ${files.metadata}
      - scripts/extract_metadata.py
    outs:
      - ${files.extracted}

  # Runs scripts/chunk_data.py over the extracted metadata and the supporting
  # docs; declared output is ${files.chunked}. -c / -ol / -m receive
  # ${hp.chunk-size}, ${hp.overlap} and ${max-length} respectively.
  chunk-data:
    cmd: >-
      uv run scripts/chunk_data.py
      -o ${files.chunked}
      -c ${hp.chunk-size}
      -ol ${hp.overlap}
      ${files.extracted}
      ${files.supporting-docs}
      -m ${max-length}
    deps:
      - ${files.extracted}
      - ${files.supporting-docs}
      - scripts/chunk_data.py
    outs:
      - ${files.chunked}

  # Runs scripts/create_embeddings.py on ${files.chunked} with model
  # ${hp.embeddings-model}; declared output is ${files.embeddings}.
  # NOTE(review): the meaning of the trailing -u flag is defined by the
  # script, not visible here.
  create-embeddings:
    cmd: >-
      uv run scripts/create_embeddings.py
      ${files.chunked}
      ${files.embeddings}
      -m ${hp.embeddings-model}
      -u
    deps:
      - ${files.chunked}
      - scripts/create_embeddings.py
    outs:
      - ${files.embeddings}

  # Runs scripts/upload_to_docstore.py on ${files.embeddings}, targeting
  # collection ${doc-store.collection}.
  # NOTE(review): the command writes to -o ${doc-store.files} while the
  # declared output is ${files.doc-store}; confirm both resolve to the same
  # location, otherwise DVC tracks a path this stage never produces.
  upload-to-docstore:
    cmd: >-
      uv run scripts/upload_to_docstore.py
      ${files.embeddings}
      -o ${doc-store.files}
      -em ${hp.embeddings-model}
      -c ${doc-store.collection}
    deps:
      - ${files.embeddings}
      - scripts/upload_to_docstore.py
    outs:
      - ${files.doc-store}

  # Runs scripts/generate_synthetic_testset.py on ${files.extracted} with
  # ${test-set-size}; declared output is ${files.test-set}.
  generate-testset:
    cmd: >-
      uv run scripts/generate_synthetic_testset.py
      ${files.extracted}
      ${files.test-set}
      ${test-set-size}
    deps:
      - ${files.extracted}
      - scripts/generate_synthetic_testset.py
    outs:
      - ${files.test-set}

  # Runs scripts/evaluate_synthetic_testset.py on ${files.test-set}; declared
  # output is ${files.cleaned-test-set}.
  evaluate-synthetic-testset:
    cmd: >-
      uv run scripts/evaluate_synthetic_testset.py
      ${files.test-set}
      ${files.cleaned-test-set}
    deps:
      - ${files.test-set}
      - scripts/evaluate_synthetic_testset.py
    outs:
      - ${files.cleaned-test-set}

  # Runs scripts/run_rag_pipeline.py with the cleaned test set as input (-i),
  # the doc store ${files.doc-store} (-ds), collection
  # ${doc-store.collection} (-c) and model ${rag.model} (-m). Declared
  # outputs are ${files.eval-set} (-o) and ${files.pipeline} (-p).
  run-rag-pipeline:
    cmd: >-
      uv run scripts/run_rag_pipeline.py
      -i ${files.cleaned-test-set}
      -o ${files.eval-set}
      -ds ${files.doc-store}
      -c ${doc-store.collection}
      -m ${rag.model}
      -p ${files.pipeline}
    deps:
      - ${files.cleaned-test-set}
      - ${files.doc-store}
      - scripts/run_rag_pipeline.py
    outs:
      - ${files.eval-set}
      - ${files.pipeline}

  # Runs scripts/evaluate.py on ${files.eval-set}; declared outputs are the
  # metrics file (-m), the evaluation plot (-img) and the results file (-r).
  evaluate:
    cmd: >-
      uv run scripts/evaluate.py
      ${files.eval-set}
      -m ${files.metrics}
      -img ${files.eval-plot}
      -r ${files.results}
    deps:
      - ${files.eval-set}
      - scripts/evaluate.py
    outs:
      - ${files.metrics}
      - ${files.eval-plot}
      - ${files.results}