# new dataset 0390 #160 — workflow file for this run

# Preprocesses the datasets that a pull request adds or changes.
name: Preprocessing
on:
  # NOTE(review): pull_request_target runs in the context of the base
  # repository, while later steps check out and execute the fork's code.
  # Confirm that this is an accepted risk for this repository.
  pull_request_target:
    # (re)opened PR or new commit in fork
    types:
      - opened
      - synchronize
      - reopened
    # only react to changes inside the dataset folders
    paths:
      - 'raw_data/**'
      - 'processed_data/**'
jobs:
  preprocess:
    name: Preprocess raw data
    # NOTE: on windows as computing of descriptors has a bug on linux right now.
    # The windows-2019 hosted image has been retired, so the job targets
    # windows-2022 instead.
    runs-on: windows-2022
    env:
      # needed for pulling R packages from github
      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
    steps:
# Check out the pull request's head branch from the contributor's fork,
# including LFS objects; fetch-depth 0 disables shallow cloning.
- name: Checkout fork repository
  uses: actions/checkout@v3
  with:
    repository: ${{github.event.pull_request.head.repo.full_name}}
    ref: ${{ github.head_ref }}
    lfs: true
    fetch-depth: 0
- name: Get changed files
  id: files
  uses: Ana06/get-changed-files@v2.2.0
# Reduce the changed file paths to the set of dataset folders they live in.
# A dataset folder is the immediate parent directory of a changed file whose
# name consists of digits only; the result is exported as the space-separated
# step output `files`.
- name: Get new/changed datasets
  id: filesfolders
  shell: bash {0}
  env:
    # File names come from the pull request and are untrusted: hand them to
    # the script through the environment instead of splicing them into the
    # script text, where they could inject shell commands.
    CHANGED_FILES: ${{ steps.files.outputs.all }}
  run: |
    # Word splitting on whitespace is intended; glob expansion is not.
    set -f
    datasets=$(
      for f in $CHANGED_FILES; do
        basename "$(dirname "$f")"
      done | grep -E '^[0-9]+$' | sort | uniq | tr '\n' ' '
    )
    echo "files=$datasets" >> "$GITHUB_OUTPUT"
# Log the raw_data contents of every affected dataset folder.
- name: List all added files
  shell: bash {0}
  run: |
    for dataset in ${{ steps.filesfolders.outputs.files }}; do
      ls -lh raw_data/$dataset
    done
# Install the Python packages listed in scripts/Python/requirements.txt.
- name: Python dependencies
  run: pip install -r scripts/Python/requirements.txt
# Provide a Temurin JDK 17 on the runner.
- name: Setup java
  uses: actions/setup-java@v3
  with:
    java-version: '17'
    distribution: temurin
# Point renv at a directory below the runner's temp folder; the same path is
# used as the cache location below.
- name: Set RENV_PATHS_ROOT
  shell: bash
  run: echo "RENV_PATHS_ROOT=${{ runner.temp }}/renv" >> $GITHUB_ENV
- name: Setup R
  uses: r-lib/actions/setup-r@v2
- name: Restore Renv package cache
  uses: actions/cache@v3
  with:
    path: ${{ env.RENV_PATHS_ROOT }}
    # The key is derived from the OS and the hash of all renv.lock files;
    # restore-keys supplies an OS-wide prefix as fallback.
    key: ${{ runner.os }}-renv-${{ hashFiles('**/renv.lock') }}
    restore-keys: |
      ${{ runner.os }}-renv-
# The script below is executed by Rscript, not by a shell.
- name: Install and activate renv
  shell: Rscript {0}
  run: |
    install.packages("renv")
    renv::restore()
# R preprocessing pipeline. Every script receives the space-separated list of
# affected dataset folders as its command-line arguments. The folded scalars
# (>-) join their lines with a single space, yielding one command line each.
- name: Standardize compounds
  run: >-
    Rscript scripts/R_ci/compounds_standardize.R
    ${{ steps.filesfolders.outputs.files }}
- name: Compounds classyfire classes
  run: >-
    Rscript scripts/R_ci/compounds_classyfire.R
    ${{ steps.filesfolders.outputs.files }}
- name: Compounds descriptors
  run: >-
    Rscript scripts/R_ci/compounds_descriptors.R
    ${{ steps.filesfolders.outputs.files }}
- name: Compounds fingerprints
  run: >-
    Rscript scripts/R_ci/compounds_fingerprints.R
    ${{ steps.filesfolders.outputs.files }}
- name: Metadata standardization
  run: >-
    Rscript scripts/R_ci/metadata_standardize.R
    ${{ steps.filesfolders.outputs.files }}
- name: Generate dataset reports
  run: >-
    Rscript scripts/R_ci/compounds_overview.R
    ${{ steps.filesfolders.outputs.files }}
- name: Verify that required files are present
  run: >-
    Rscript scripts/R_ci/files_complete.R
    ${{ steps.filesfolders.outputs.files }}
# Python overview and validation steps. All of them are best-effort:
# continue-on-error keeps the job going when one of them fails.
- name: Update overview table of all datasets
  continue-on-error: true
  run: python3 scripts/Python/datasets_overview.py
- name: QSPR-based validation
  continue-on-error: true
  run: >-
    python3 scripts/Python/validation_qspr.py
    ${{ steps.filesfolders.outputs.files }}
- name: Retention order-based validation for datasets with nominally identical setups
  continue-on-error: true
  run: >-
    python3 scripts/Python/validation_order.py --mode same_condition
    ${{ steps.filesfolders.outputs.files }}
- name: Retention order-based validation for datasets of systematic measurements
  continue-on-error: true
  run: >-
    python3 scripts/Python/validation_order.py --mode systematic
    ${{ steps.filesfolders.outputs.files }}
# Commit the generated files and push them to the checked-out branch.
# The order of the two pushes matters: LFS objects go first.
- name: Commit preprocessing
  run: |
    git config --global user.name 'Github Actions'
    git config --global user.email 'actions@github.com'
    # Use LFS storage of main repository: no push access to fork LFS storage
    # TODO: change once repository is moved
    git config lfs.url 'https://github.com/michaelwitting/RepoRT.git/info/lfs'
    git add processed_data raw_data
    git commit -m "Preprocessing ${{ steps.filesfolders.outputs.files }}"
    git lfs push origin HEAD # first push LFS, otherwise failure because of lfs.url
    git push origin HEAD
# Run scripts/Python/report.py for the affected datasets and post its stdout
# as a comment on the pull request. Best-effort: a failure here does not fail
# the job.
- name: Add comment with report to PR
  uses: actions/github-script@v6
  continue-on-error: true
  with:
    script: |
      // One argument per dataset folder.
      const datasets = '${{ steps.filesfolders.outputs.files }}'.trim().split(' ')
      const report = await exec.getExecOutput('python3 scripts/Python/report.py', datasets)
      // Await the request so the step only completes once the comment has
      // been created and any API error is reported by this step.
      await github.rest.issues.createComment({
        issue_number: context.issue.number,
        owner: context.repo.owner,
        repo: context.repo.repo,
        body: report.stdout
      })
# Record the outcome of the run as a label on the pull request.
# NOTE(review): andymckay/labeler is referenced by its mutable master branch;
# consider pinning it to a release tag or commit.
- name: Label as successfully preprocessed
  if: ${{ success() }}
  uses: andymckay/labeler@master
  with:
    add-labels: "preprocessing successful"
    remove-labels: "preprocessing failed"
# Apply the failure label before opening the debug session: the tmate step
# blocks until the session ends, which would otherwise delay the label or
# skip it altogether if the job is cancelled or times out meanwhile.
- name: Label as failed
  if: ${{ failure() }}
  uses: andymckay/labeler@master
  with:
    add-labels: "preprocessing failed"
    remove-labels: "preprocessing successful"
- name: Debug with tmate on failure
  if: ${{ failure() }}
  uses: mxschmitt/action-tmate@v3
  # Bound the session so an unattended failure does not hold the runner
  # until the job-level timeout.
  timeout-minutes: 30