Skip to content

Commit

Permalink
Add testing workflow (#34)
Browse files Browse the repository at this point in the history
* Add testing workflow

* Update test pipeline

* Add test badge to README

* Update testing workflow

* Update testing workflow

* Prepare to check outputs

* Add output checking

* Finish output checking
  • Loading branch information
dfornika authored Feb 15, 2024
1 parent eb38c15 commit 07a25b1
Show file tree
Hide file tree
Showing 11 changed files with 341 additions and 0 deletions.
24 changes: 24 additions & 0 deletions .github/data/illumina-run-simulator/config.edn
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
{:instruments [{:instrument-id "M00123"
:output-dir "artifacts/simulated_runs/M00123/22"
:output-dir-structure :old
:instrument-type :miseq
:starting-run-number 300}
{:instrument-id "M00456"
:output-dir "artifacts/simulated_runs/M00456/22"
:output-dir-structure :new
:instrument-type :miseq
:starting-run-number 256}
{:instrument-id "VH00123"
:output-dir "artifacts/simulated_runs/VH00123/22"
:instrument-type :nextseq
:starting-run-number 12}]
:projects ["mysterious_experiment"
"routine_testing"
"quality_check"
"viral_outbreak"
"assay_development"
"42"]
:starting-plate-number 100
:starting-date "2022-06-01"
:run-interval-ms 100
:mark-upload-complete true}
8 changes: 8 additions & 0 deletions .github/data/symlink-seqs/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"sequencing_run_parent_dirs": [
"artifacts/simulated_runs/M00123/22",
"artifacts/simulated_runs/M00456/22",
"artifacts/simulated_runs/VH00123/22"
],
"simplify_sample_id": true
}
31 changes: 31 additions & 0 deletions .github/scripts/add_qc_check_complete.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/usr/bin/env python

import argparse
import glob
import os
import random
import json


def main(args):
    """
    Write a ``qc_check_complete.json`` file into every simulated run directory that lacks one.

    Candidate run directories are the entries three levels below
    ``args.simulated_runs_dir``. Each newly written file records an
    ``overall_pass_fail`` of "FAIL" with probability ``args.proportion_failed``
    and "PASS" otherwise. Existing files are never overwritten.

    :param args: Parsed command-line arguments (``simulated_runs_dir`` and ``proportion_failed`` are read)
    :type args: argparse.Namespace
    :return: None
    :rtype: None
    """
    run_dirs_glob = os.path.join(args.simulated_runs_dir, '*', '*', '*')
    for run_dir in glob.glob(run_dirs_glob):
        # The glob also matches plain files at run depth; a QC file can only
        # be created inside a directory, so anything else is skipped.
        if not os.path.isdir(run_dir):
            continue

        qc_check_complete_file = os.path.join(run_dir, 'qc_check_complete.json')
        if os.path.exists(qc_check_complete_file):
            continue

        if random.random() < args.proportion_failed:
            overall_pass_fail = "FAIL"
        else:
            overall_pass_fail = "PASS"
        qc_check = {'overall_pass_fail': overall_pass_fail}

        with open(qc_check_complete_file, 'w') as f:
            f.write(json.dumps(qc_check, indent=2))
            f.write('\n')

if __name__ == '__main__':
    # Command-line entry point. --simulated-runs-dir defaults to the same
    # location the other helper scripts use, so omitting it no longer fails
    # inside os.path.join(None, ...).
    parser = argparse.ArgumentParser()
    parser.add_argument('--simulated-runs-dir', default='artifacts/simulated_runs', help='Directory containing simulated runs')
    parser.add_argument('--proportion-failed', default=0.1, type=float, help='Probability that a run is marked as failing QC')
    args = parser.parse_args()
    main(args)
119 changes: 119 additions & 0 deletions .github/scripts/check_outputs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
#!/usr/bin/env python3

import argparse
import csv
import glob
import json
import os
import re


def collect_qc_status_by_run_id(simulated_runs_dir):
    """
    Collect the QC status for each simulated run.

    Run directories are the entries three levels below ``simulated_runs_dir``.
    The run ID is the directory's base name and the status is the
    ``overall_pass_fail`` value of its ``qc_check_complete.json`` file.

    :param simulated_runs_dir: Directory containing simulated runs
    :type simulated_runs_dir: str
    :return: QC status by run ID
    :rtype: dict[str, str]
    :raises OSError: If a matched entry has no readable ``qc_check_complete.json``
    :raises KeyError: If a QC file has no ``overall_pass_fail`` key
    """
    qc_status_by_run_id = {}
    # Use the function's own parameter rather than a module-level ``args``,
    # so the function also works when imported or called directly.
    simulated_run_dirs_glob = os.path.join(simulated_runs_dir, '*', '*', '*')

    for run_dir in glob.glob(simulated_run_dirs_glob):
        run_id = os.path.basename(run_dir)
        qc_check_complete_file = os.path.join(run_dir, 'qc_check_complete.json')
        with open(qc_check_complete_file, 'r') as f:
            qc_check = json.load(f)
        qc_status_by_run_id[run_id] = qc_check['overall_pass_fail']

    return qc_status_by_run_id


def parse_symlink_seqs_output_csv(symlink_seqs_output_csv):
    """
    Parse a symlink-seqs CSV and annotate each row with the run ID found in its R1 path.

    The run ID is the first substring of the ``R1`` column that matches the
    NextSeq run ID pattern, or failing that the MiSeq pattern. Rows whose
    ``R1`` value matches neither pattern get a ``RUN_ID`` of None.

    :param symlink_seqs_output_csv: Path to the symlink-seqs output CSV (must have an ``R1`` column)
    :type symlink_seqs_output_csv: str
    :return: One dict per CSV row, with an added ``RUN_ID`` key
    :rtype: list[dict[str, str|None]]
    """
    # Raw strings keep the regex escapes out of Python's own string-escape
    # processing; compiling once hoists the work out of the row loop.
    miseq_regex = re.compile(r'\d{6}_M\d{5}_\d{4}_\d{9}-[A-Z0-9]{5}')
    nextseq_regex = re.compile(r'\d{6}_VH\d{5}_\d+_[A-Z0-9]{9}')

    symlink_seqs_output = []
    with open(symlink_seqs_output_csv, 'r', newline='') as f:
        for row in csv.DictReader(f):
            # A NextSeq match takes precedence over a MiSeq match.
            match = nextseq_regex.search(row['R1']) or miseq_regex.search(row['R1'])
            row['RUN_ID'] = match.group(0) if match else None
            symlink_seqs_output.append(row)

    return symlink_seqs_output


def check_no_qc_failed_runs_are_symlinked(symlink_seqs_output, qc_status_by_run_id):
    """
    Verify that every symlinked library comes from a run whose QC status is 'PASS'.

    :param symlink_seqs_output: Symlink-seqs output rows, each carrying a ``RUN_ID`` key
    :type symlink_seqs_output: list[dict[str, str]]
    :param qc_status_by_run_id: QC status by run ID
    :type qc_status_by_run_id: dict[str, str]
    :return: True when no library belongs to a run with a non-'PASS' status, False otherwise
    :rtype: bool
    :raises KeyError: If a library's run ID is absent from ``qc_status_by_run_id``
    """
    # Look up every status before evaluating, so an unknown run ID always
    # raises, regardless of where a failing run sits in the list.
    statuses = [qc_status_by_run_id[entry['RUN_ID']] for entry in symlink_seqs_output]

    for status in statuses:
        if status != 'PASS':
            return False

    return True


def main(args):
    """
    Run the output checks, write their results as CSV, and exit non-zero on any failure.

    The results file has a ``test_name`` and a ``test_result`` ("PASS" or
    "FAIL") column, one row per check.

    :param args: Parsed command-line arguments (``simulated_runs_dir``, ``symlink_seqs_output_csv`` and ``output`` are read)
    :type args: argparse.Namespace
    :return: None
    :rtype: None
    :raises SystemExit: With exit code 1 if any check failed
    """
    qc_status_by_run_id = collect_qc_status_by_run_id(args.simulated_runs_dir)

    symlink_seqs_output = parse_symlink_seqs_output_csv(args.symlink_seqs_output_csv)

    no_qc_failed_runs_are_symlinked = check_no_qc_failed_runs_are_symlinked(symlink_seqs_output, qc_status_by_run_id)

    tests = [
        {
            'test_name': 'no_qc_failed_runs_are_symlinked',
            'test_passed': no_qc_failed_runs_are_symlinked,
        },
    ]
    for test in tests:
        test['test_result'] = "PASS" if test['test_passed'] else "FAIL"

    output_fields = [
        "test_name",
        "test_result"
    ]

    # newline='' lets the csv module control line endings itself.
    with open(args.output, 'w', newline='') as f:
        # extrasaction='ignore' drops the internal 'test_passed' key from the rows.
        writer = csv.DictWriter(f, fieldnames=output_fields, extrasaction='ignore')
        writer.writeheader()
        writer.writerows(tests)

    # Raise SystemExit directly; the exit() builtin is installed by the site
    # module and is intended for interactive use.
    if not all(test['test_passed'] for test in tests):
        raise SystemExit(1)


if __name__ == '__main__':
    # Command-line entry point.
    parser = argparse.ArgumentParser()
    parser.add_argument('--simulated-runs-dir', default='artifacts/simulated_runs', help='Directory containing simulated runs')
    parser.add_argument('--symlink-seqs-output-csv', default='artifacts/symlink-seqs/mysterious_experiment.csv', help='Path to symlink-seqs output CSV')
    # The results file is always written, so a missing path is rejected by
    # argparse up front instead of failing later in open(None).
    parser.add_argument('-o', '--output', type=str, required=True, help='Path to the output file')
    args = parser.parse_args()
    main(args)
3 changes: 3 additions & 0 deletions .github/scripts/check_outputs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash

# Run the output checks and write their results to the artifacts directory.
.github/scripts/check_outputs.py \
    --output artifacts/check_outputs_results.csv
80 changes: 80 additions & 0 deletions .github/scripts/copy_samplesheets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
#!/usr/bin/env python3

# This script is used to temporarily fix an issue with the illumina-run-simulator
# where the samplesheet is not being written to the demultiplexed output directory.

import argparse
import glob
import json
import os
import shutil


def get_instrument_type(run_id):
    """
    Get the instrument type ("miseq" or "nextseq"), based on the run ID.

    The instrument ID is taken to be the second underscore-separated field of
    the run ID. IDs starting with "M" map to "miseq" and IDs starting with
    "V" map to "nextseq".

    :param run_id: Run ID (the base name of a run directory)
    :type run_id: str
    :return: Instrument type, or None if it cannot be determined
    :rtype: str|None
    """
    run_id_fields = run_id.split('_')
    # A run ID without an underscore has no instrument field to inspect.
    if len(run_id_fields) < 2:
        return None

    instrument_id = run_id_fields[1]
    if instrument_id.startswith('M'):
        return 'miseq'
    if instrument_id.startswith('V'):
        return 'nextseq'

    return None


def get_demultiplexing_outdir(run):
    """
    Find the appropriate subdirectory to copy the samplesheet to.

    For MiSeq runs this is the run directory itself when it contains a
    ``Data`` entry, otherwise the last (in sorted order) match of
    ``Alignment_*/*``. For NextSeq runs it is the last match of
    ``Analysis/*/Data``.

    :param run: Run directory
    :type run: str
    :return: Demultiplexing output directory, or None if the instrument type
             is unknown or no matching directory exists
    :rtype: str|None
    """
    run_id = os.path.basename(run)
    instrument_type = get_instrument_type(run_id)

    if instrument_type == 'miseq':
        if os.path.exists(os.path.join(run, 'Data')):
            return run
        demultiplexing_outdir_glob = os.path.join(run, 'Alignment_*', '*')
    elif instrument_type == 'nextseq':
        demultiplexing_outdir_glob = os.path.join(run, 'Analysis', '*', 'Data')
    else:
        return None

    # Reverse-sorted so the lexicographically greatest match comes first; a
    # run with no matching output directory yields None rather than an IndexError.
    candidates = sorted(glob.glob(demultiplexing_outdir_glob), reverse=True)
    if not candidates:
        return None

    return candidates[0]


def main(args):
    """
    Copy each simulated run's ``SampleSheet.csv`` into its demultiplexing output directory.

    Runs are skipped when their instrument type is unknown, when they have no
    demultiplexing output directory, when the source samplesheet is missing,
    or when the source and destination are the same file.

    :param args: Parsed command-line arguments (``simulated_runs_dir`` is read)
    :type args: argparse.Namespace
    :return: None
    :rtype: None
    """
    simulated_runs_glob = os.path.join(args.simulated_runs_dir, '*', '*', '*')
    for run in glob.glob(simulated_runs_glob):
        run_id = os.path.basename(run)
        if get_instrument_type(run_id) is None:
            continue

        demultiplexing_outdir = get_demultiplexing_outdir(run)
        if demultiplexing_outdir is None:
            # Nothing has been demultiplexed for this run, so there is nowhere to copy to.
            continue

        samplesheet_path_src = os.path.join(run, 'SampleSheet.csv')
        samplesheet_path_dst = os.path.join(demultiplexing_outdir, 'SampleSheet.csv')
        try:
            shutil.copy(samplesheet_path_src, samplesheet_path_dst)
        except (FileNotFoundError, shutil.SameFileError):
            # Best-effort copy: a missing samplesheet, or a destination that
            # is the run directory itself, is not an error for this workaround.
            continue


if __name__ == '__main__':
    # Command-line entry point: parse the options and hand them to main().
    arg_parser = argparse.ArgumentParser(description='')
    arg_parser.add_argument(
        '--simulated-runs-dir',
        default='artifacts/simulated_runs',
        help='Directory containing simulated runs',
    )
    args = arg_parser.parse_args()
    main(args)
3 changes: 3 additions & 0 deletions .github/scripts/download_run_simulator.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash

# Download the standalone illumina-run-simulator jar (release v0.1.0) into the
# current directory as illumina-run-simulator.jar.
release_url="https://github.com/dfornika/illumina-run-simulator/releases/download/v0.1.0/illumina-run-simulator-0.1.0-standalone.jar"

wget "${release_url}" -O illumina-run-simulator.jar
10 changes: 10 additions & 0 deletions .github/scripts/run_symlink-seqs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/bin/bash

# Run symlink-seqs for one project, capturing its CSV output (stdout) under
# artifacts/symlink-seqs.
project_id='mysterious_experiment'
csv_dir='artifacts/symlink-seqs'

mkdir -p "${csv_dir}"

./symlink-seqs \
    --config .github/data/symlink-seqs/config.json \
    --project-id "${project_id}" \
    --csv \
    -o "artifacts/symlinks/${project_id}" \
    > "${csv_dir}/${project_id}.csv"
16 changes: 16 additions & 0 deletions .github/scripts/simulate_runs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/bin/bash

# Generate simulated sequencing runs, then add the files the simulator does
# not produce (QC check results and per-run samplesheet copies).

mkdir -p artifacts/simulated_runs/{M00123,M00456,VH00123}/22

# Run the simulator for at most 10 seconds. `timeout` reports status 124 when
# it had to stop the command at the limit; that is treated as success below.
# `code` is initialised so the check never reads an unset variable.
code=0
timeout 10 java -jar illumina-run-simulator.jar \
    --config .github/data/illumina-run-simulator/config.edn \
    2> >(tee artifacts/illumina-run-simulator.log.jsonl) \
    || code=$?

if [[ $code -ne 124 && $code -ne 0 ]]; then
    exit "$code"
fi

# Stop here if the QC files could not be written; otherwise the failure would
# be masked by the exit status of the final command.
.github/scripts/add_qc_check_complete.py --simulated-runs-dir artifacts/simulated_runs || exit $?

.github/scripts/copy_samplesheets.py --simulated-runs-dir artifacts/simulated_runs
44 changes: 44 additions & 0 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# CI workflow: simulates sequencing runs, runs symlink-seqs against them and
# checks the resulting outputs.
name: "Tests"
# Runs on pushes and pull requests targeting main, and on manual dispatch.
on:
push:
branches:
- main
pull_request:
branches:
- main
workflow_dispatch:
jobs:
test:
# One job per Python version; fail-fast is disabled so a failure on one
# version does not cancel the others.
strategy:
fail-fast: false
matrix:
python_version: ["3.9", "3.10", "3.11", "3.12"]
name: Run tests
runs-on: ubuntu-latest
steps:
- name: Checkout Repo
uses: actions/checkout@v1
# NOTE(review): this creates the nested path artifacts/artifacts; confirm the nesting is intended.
- name: Create artifacts directory
run: |
mkdir -p artifacts/artifacts
# Java is needed for the illumina-run-simulator jar.
- name: Setup Java
uses: actions/setup-java@v3
with:
distribution: 'temurin'
java-version: '21'
- name: Download Run Simulator
run: |
bash .github/scripts/download_run_simulator.sh
# NOTE(review): this step runs before "Setup Python", so the Python helper
# scripts it invokes use whatever Python the runner image provides, not the
# matrix version. Confirm that is intended.
- name: Simulate Runs
run: |
bash .github/scripts/simulate_runs.sh
- name: Setup Python
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python_version }}
- name: Run symlink-seqs
run: |
bash .github/scripts/run_symlink-seqs.sh
- name: Check Outputs
run: |
bash .github/scripts/check_outputs.sh
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
[![Tests](https://github.com/BCCDC-PHL/symlink-seqs/actions/workflows/tests.yml/badge.svg)](https://github.com/BCCDC-PHL/symlink-seqs/actions/workflows/tests.yml)

# symlink-seqs

Create fastq symlinks for selected samples in sequencer output directories based on project ID from SampleSheet files.

## Usage
Expand Down

0 comments on commit 07a25b1

Please sign in to comment.