-
Notifications
You must be signed in to change notification settings - Fork 18
123 lines (106 loc) · 4.24 KB
/
preprocess_manual.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# Manually triggered workflow (workflow_dispatch only).
# `datasets` is a required, space-separated list of dataset IDs; each `do_*`
# boolean (default: true) gates the matching step(s) of the `preprocess` job.
name: Preprocessing of specified datasets
on:
  workflow_dispatch:
    inputs:
      datasets:
        description: 'Datasets to preprocess, separated by space. Example: 0001 0002 0044'
        required: true
        type: string
      do_standardize:
        description: 'Whether to standardize compounds'
        type: boolean
        default: true
      do_classyfire:
        description: 'Whether to compute ClassyFire classes'
        type: boolean
        default: true
      do_descriptors:
        description: 'Whether to compute descriptors'
        type: boolean
        default: true
      do_fingerprints:
        description: 'Whether to compute fingerprints'
        type: boolean
        default: true
      do_metadata:
        description: 'Whether to standardize metadata'
        type: boolean
        default: true
      do_validation:
        description: 'Whether to run validation procedures'
        type: boolean
        default: true
jobs:
  # Single job: runs the preprocessing scripts for the requested datasets inside
  # the project's preprocessing container, then commits and pushes the results.
  preprocess:
    name: Preprocess raw data
    runs-on: ubuntu-latest
    container:
      image: ghcr.io/${{ github.repository_owner }}/repo_rt_preprocessing:latest
      # NOTE(review): the source this was restored from had lost its indentation;
      # `env` directly followed `image`, so it is kept under `container` - confirm.
      env:
        RENV_PATHS_LIBRARY: '/renv/library'
    env:
      # Hand the free-text input to the scripts through the environment instead
      # of splicing `${{ inputs.datasets }}` into the shell source, so that its
      # content can never be interpreted as shell syntax.
      DATASETS: ${{ inputs.datasets }}
    defaults:
      run:
        shell: bash
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3
        with:
          lfs: true

      # Purely informational; missing directories must not fail the run.
      - name: List all files for selected datasets
        run: |
          for f in $DATASETS; do
            ls -lh "raw_data/$f" || true
            ls -lh "processed_data/$f" || true
          done
        continue-on-error: true

      - name: check renv
        run: Rscript -e "renv::status()"

      # $DATASETS is deliberately left unquoted in the script calls below:
      # word splitting turns the space-separated list into one argument per
      # dataset ID.
      - name: Standardize compounds
        run: Rscript scripts/R_ci/compounds_standardize.R $DATASETS
        if: ${{ inputs.do_standardize }}

      - name: Compounds classyfire classes
        run: Rscript scripts/R_ci/compounds_classyfire.R $DATASETS
        if: ${{ inputs.do_classyfire }}

      - name: Compounds descriptors
        run: Rscript scripts/R_ci/compounds_descriptors.R $DATASETS
        if: ${{ inputs.do_descriptors }}

      - name: Compounds fingerprints
        run: Rscript scripts/R_ci/compounds_fingerprints.R $DATASETS
        if: ${{ inputs.do_fingerprints }}

      - name: Metadata standardization
        run: Rscript scripts/R_ci/metadata_standardize.R $DATASETS
        if: ${{ inputs.do_metadata }}

      # The next two steps have no `if:` and therefore always run.
      - name: Generate dataset reports
        run: Rscript scripts/R_ci/compounds_overview.R $DATASETS

      - name: Verify that required files are present
        run: Rscript scripts/R_ci/files_complete.R $DATASETS

      # Overview and validation steps are best-effort: a failure is reported
      # but does not stop the job (continue-on-error).
      - name: Update overview table of all datasets
        run: python3 scripts/Python/datasets_overview.py
        continue-on-error: true

      - name: QSPR-based validation
        run: python3 scripts/Python/validation_qspr.py $DATASETS
        continue-on-error: true
        if: ${{ inputs.do_validation }}

      - name: Retention order-based validation for datasets with nominally identical setups
        run: python3 scripts/Python/validation_order.py --mode same_condition $DATASETS
        continue-on-error: true
        if: ${{ inputs.do_validation }}

      - name: Retention order-based validation for datasets of systematic measurements
        run: python3 scripts/Python/validation_order.py --mode systematic $DATASETS
        continue-on-error: true
        if: ${{ inputs.do_validation }}

      # NOTE(review): the user.email value below looks like an e-mail
      # obfuscation placeholder rather than a real address - confirm the
      # intended committer e-mail.
      - name: Commit preprocessing
        run: |
          git config --global user.email '[email protected]'
          git config --global user.name 'Github Actions'
          # because of dockerized environment, git will otherwise complain about "dubious ownership of directory"
          git config --global safe.directory '*'
          git add processed_data raw_data
          # `git commit` exits non-zero when nothing is staged; treat an
          # unchanged tree as success instead of failing the whole run.
          if git diff --cached --quiet; then
            echo "No changes to commit for: $DATASETS"
            exit 0
          fi
          git commit -m "Preprocessing $DATASETS" -m "Tasks:
          - standardize compounds: ${{ inputs.do_standardize }}
          - compute classyfire classes: ${{ inputs.do_classyfire }}
          - compute descriptors: ${{ inputs.do_descriptors }}
          - compute fingerprints: ${{ inputs.do_fingerprints }}
          - standardize metadata: ${{ inputs.do_metadata }}
          - run validation: ${{ inputs.do_validation }}"
          git lfs push origin HEAD # first push LFS, otherwise failure because of lfs.url
          git push origin HEAD