# Workflow file for this run: "Add dataset dataset_43" (#180)

# Preprocessing workflow: triggered for pull requests that touch dataset
# files under raw_data/ or processed_data/.
name: Preprocessing
on:
  pull_request_target:
    # (re)opened PR or new commit in fork
    types: [opened, synchronize, reopened]
    # Only changes below these directories trigger the workflow.
    paths:
      - 'raw_data/**'
      - 'processed_data/**'
jobs:
  # Checks out the pull request's head branch, runs the R/Python preprocessing
  # and validation scripts for every dataset folder touched by the PR, commits
  # and pushes the results, and reports back on the PR (comments + labels).
  #
  # NOTE(review): the workflow is triggered by `pull_request_target` while this
  # job checks out the PR head repository and executes scripts from that
  # checkout. GitHub's security guidance treats this combination as dangerous
  # (untrusted code running in the base repository's context). Values derived
  # from the PR are therefore only handed to scripts through environment
  # variables below, but the design itself should be reviewed.
  preprocess:
    name: Preprocess raw data
    runs-on: ubuntu-latest
    container:
      image: ghcr.io/${{ github.repository_owner }}/repo_rt_preprocessing:latest
    # NOTE(review): placed at job level; the flattened source does not show
    # whether `env` was nested under `container` instead -- confirm.
    env:
      RENV_PATHS_LIBRARY: '/renv/library'
    defaults:
      run:
        shell: bash
    steps:
      - name: Checkout fork repository
        uses: actions/checkout@v3
        with:
          fetch-depth: 0
          repository: ${{ github.event.pull_request.head.repo.full_name }}
          ref: ${{ github.head_ref }}
          lfs: true

      - name: Get changed files
        id: files
        # NOTE(review): the reference was garbled in the source
        # ("Ana06/[email protected]", e-mail obfuscation of "...@vX.Y.Z").
        # Reconstructed from the `added_modified_renamed` output used below;
        # confirm the exact version tag.
        uses: Ana06/get-changed-files@v2.2.0

      # Reduces the changed file paths to the unique names of their parent
      # directories and exposes them, space-separated, as output `files`.
      - name: Get new/changed datasets
        id: filesfolders
        env:
          # Passed via the environment instead of being expanded into the
          # script text: file names come from the pull request.
          CHANGED_FILES: ${{ steps.files.outputs.added_modified_renamed }}
        # Custom shell template, i.e. without the `-e -o pipefail` options
        # that the plain `bash` shortcut adds.
        shell: bash {0}
        # NOTE(review): the alternation in the pattern is not grouped, so it
        # matches names that start with digits OR names that contain
        # "draft_"; `^([0-9]+|draft_.*)$` may have been intended. Unchanged.
        run: |
          # $CHANGED_FILES is deliberately unquoted: one word per file path.
          datasets=$(
            for f in $CHANGED_FILES; do
              basename "$(dirname "$f")"
            done | grep -E '^([0-9]+)|(draft_.*)$' | sort | uniq | tr '\n' ' '
          )
          echo "files=$datasets" >> "$GITHUB_OUTPUT"

      # In all following steps $DATASETS is deliberately left unquoted so
      # that every dataset name becomes a separate argument.
      - name: List all added files
        env:
          DATASETS: ${{ steps.filesfolders.outputs.files }}
        run: |
          for f in $DATASETS; do
            ls -lh "raw_data/$f"
          done

      - name: Standardize compounds
        env:
          DATASETS: ${{ steps.filesfolders.outputs.files }}
        run: Rscript scripts/R_ci/compounds_standardize.R $DATASETS

      - name: Compounds classyfire classes
        env:
          DATASETS: ${{ steps.filesfolders.outputs.files }}
        run: Rscript scripts/R_ci/compounds_classyfire.R $DATASETS

      - name: Compounds descriptors
        env:
          DATASETS: ${{ steps.filesfolders.outputs.files }}
        run: Rscript scripts/R_ci/compounds_descriptors.R $DATASETS

      - name: Compounds fingerprints
        env:
          DATASETS: ${{ steps.filesfolders.outputs.files }}
        run: Rscript scripts/R_ci/compounds_fingerprints.R $DATASETS

      - name: Metadata standardization
        env:
          DATASETS: ${{ steps.filesfolders.outputs.files }}
        run: Rscript scripts/R_ci/metadata_standardize.R $DATASETS

      - name: Generate dataset reports
        env:
          DATASETS: ${{ steps.filesfolders.outputs.files }}
        run: Rscript scripts/R_ci/compounds_overview.R $DATASETS

      - name: Verify that required files are present
        env:
          DATASETS: ${{ steps.filesfolders.outputs.files }}
        run: Rscript scripts/R_ci/files_complete.R $DATASETS

      # The overview and validation steps below are best-effort: a failure
      # does not fail the job (continue-on-error).
      - name: Update overview table of all datasets
        run: python3 scripts/Python/datasets_overview.py
        continue-on-error: true

      - name: QSPR-based validation
        env:
          DATASETS: ${{ steps.filesfolders.outputs.files }}
        run: python3 scripts/Python/validation_qspr.py $DATASETS
        continue-on-error: true

      - name: Retention order-based validation for datasets with nominally identical setups
        env:
          DATASETS: ${{ steps.filesfolders.outputs.files }}
        run: python3 scripts/Python/validation_order.py --mode same_condition $DATASETS
        continue-on-error: true

      - name: Retention order-based validation for datasets of systematic measurements
        env:
          DATASETS: ${{ steps.filesfolders.outputs.files }}
        run: python3 scripts/Python/validation_order.py --mode systematic $DATASETS
        continue-on-error: true

      - name: Commit preprocessing
        env:
          DATASETS: ${{ steps.filesfolders.outputs.files }}
        run: |
          # NOTE(review): this address looks like e-mail-obfuscation residue
          # from a web copy of the file -- restore the real address.
          git config --global user.email '[email protected]'
          git config --global user.name 'Github Actions'
          # because of dockerized environment, git will otherwise complain about "dubious ownership of directory"
          git config --global safe.directory '*'
          # Use LFS storage of main repository: no push access to fork LFS storage
          # TODO: change once repository is moved
          git config lfs.url 'https://github.com/michaelwitting/RepoRT.git/info/lfs'
          git add processed_data raw_data
          git commit -m "Preprocessing $DATASETS"
          git lfs push origin HEAD # first push LFS, otherwise failure because of lfs.url
          git push origin HEAD

      # Runs report.py with the dataset names as arguments and posts its
      # standard output as a PR comment.
      - name: Add comment with report to PR
        uses: actions/github-script@v6
        env:
          # Read through process.env rather than expanded into the script
          # source, so the value cannot break out of a JavaScript literal.
          DATASETS: ${{ steps.filesfolders.outputs.files }}
        with:
          script: |
            const datasets = (process.env.DATASETS || '').trim().split(' ')
            const report = await exec.getExecOutput('python3 scripts/Python/report.py', datasets)
            // awaited so that a failed API call surfaces as a step error
            await github.rest.issues.createComment({
              issue_number: context.issue.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body: report.stdout
            })
        continue-on-error: true

      - name: Label as successfully preprocessed
        if: ${{ success() }}
        # NOTE(review): tracks a moving branch; consider pinning to a tag/SHA.
        uses: andymckay/labeler@master
        with:
          add-labels: "preprocessing successful"
          remove-labels: "preprocessing failed"

      - name: Add comment on how to proceed with PR
        uses: actions/github-script@v6
        with:
          script: |
            // awaited so that a failed API call surfaces as a step error
            await github.rest.issues.createComment({
              issue_number: context.issue.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body: 'This Pull Request is in "draft" status. When all issues (if any) are addressed, the status can manually be set to "ready for review". Subsequently, the temporary dataset IDs will be replaced with the final ones; then the Pull Request can be merged.'
            })
        continue-on-error: true

      # Runs only when an earlier step (without continue-on-error) failed.
      - name: Label as failed
        if: ${{ failure() }}
        uses: andymckay/labeler@master
        with:
          add-labels: "preprocessing failed"
          remove-labels: "preprocessing successful"