Add testing workflow (#34)

* Add testing workflow * Update test pipeline * Add test badge to README * Update testing workflow * Update testing workflow * Prepare to check outputs * Add output checking * Finish output checking
BCCDC-PHL · Feb 15, 2024 · 07a25b1 · 07a25b1
1 parent eb38c15
commit 07a25b1
Show file tree

Hide file tree

Showing 11 changed files with 341 additions and 0 deletions.
diff --git a/.github/data/illumina-run-simulator/config.edn b/.github/data/illumina-run-simulator/config.edn
@@ -0,0 +1,24 @@
+{:instruments [{:instrument-id "M00123"
+                :output-dir "artifacts/simulated_runs/M00123/22"
+                :output-dir-structure :old
+                :instrument-type :miseq
+                :starting-run-number 300}
+               {:instrument-id "M00456"
+                :output-dir "artifacts/simulated_runs/M00456/22"
+                :output-dir-structure :new
+                :instrument-type :miseq
+                :starting-run-number 256}
+               {:instrument-id "VH00123"
+                :output-dir "artifacts/simulated_runs/VH00123/22"
+                :instrument-type :nextseq
+                :starting-run-number 12}]
+ :projects ["mysterious_experiment"
+            "routine_testing"
+            "quality_check"
+            "viral_outbreak"
+            "assay_development"
+            "42"]
+ :starting-plate-number 100
+ :starting-date "2022-06-01"
+ :run-interval-ms 100
+ :mark-upload-complete true}
diff --git a/.github/data/symlink-seqs/config.json b/.github/data/symlink-seqs/config.json
@@ -0,0 +1,8 @@
+{
+    "sequencing_run_parent_dirs": [
+	"artifacts/simulated_runs/M00123/22",
+	"artifacts/simulated_runs/M00456/22",
+	"artifacts/simulated_runs/VH00123/22"
+    ],
+    "simplify_sample_id": true
+}
diff --git a/.github/scripts/add_qc_check_complete.py b/.github/scripts/add_qc_check_complete.py
@@ -0,0 +1,31 @@
+#!/usr/bin/env python
+
+import argparse
+import glob
+import os
+import random
+import json
+
+
+def main(args):
+    """
+    Write a randomised ``qc_check_complete.json`` into each simulated run directory.
+
+    Every path matching ``<simulated_runs_dir>/*/*/*`` is visited. Paths that
+    already have a ``qc_check_complete.json`` are left untouched.
+
+    :param args: Parsed arguments providing ``simulated_runs_dir`` and ``proportion_failed``
+    :type args: argparse.Namespace
+    :return: None
+    """
+    pattern = os.path.join(args.simulated_runs_dir, '*', '*', '*')
+    for candidate in glob.glob(pattern):
+        target = os.path.join(candidate, 'qc_check_complete.json')
+        if os.path.exists(target):
+            continue
+
+        # One random draw per run decides whether it is marked as failed.
+        failed = random.random() < args.proportion_failed
+        payload = {'overall_pass_fail': "FAIL" if failed else "PASS"}
+
+        with open(target, 'w') as handle:
+            handle.write(json.dumps(payload, indent=2) + '\n')
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Add a randomised qc_check_complete.json to each simulated run.')
+    # Without a default, omitting this flag would pass None to os.path.join() in main().
+    parser.add_argument('--simulated-runs-dir', default='artifacts/simulated_runs', help='Directory containing simulated runs')
+    parser.add_argument('--proportion-failed', default=0.1, type=float, help='Probability that a run is marked as QC FAIL')
+    args = parser.parse_args()
+    main(args)
diff --git a/.github/scripts/check_outputs.py b/.github/scripts/check_outputs.py
@@ -0,0 +1,119 @@
+#!/usr/bin/env python3
+
+import argparse
+import csv
+import glob
+import json
+import os
+import re
+
+
+def collect_qc_status_by_run_id(simulated_runs_dir):
+    """
+    Collect the QC status for each simulated run.
+
+    Every path matching ``<simulated_runs_dir>/*/*/*`` is treated as a run
+    directory: its basename is used as the run ID and the ``overall_pass_fail``
+    value of its ``qc_check_complete.json`` is used as the status.
+
+    :param simulated_runs_dir: Directory containing simulated runs
+    :type simulated_runs_dir: str
+    :return: QC status by run ID
+    :rtype: dict[str, str]
+    :raises OSError: If a matched path has no readable ``qc_check_complete.json``
+    :raises KeyError: If a QC file has no ``overall_pass_fail`` field
+    """
+    qc_status_by_run_id = {}
+    # Use the function parameter rather than a module-level ``args``, so the
+    # function also works when it is imported instead of run as a script.
+    simulated_run_dirs_glob = os.path.join(simulated_runs_dir, '*', '*', '*')
+
+    for run_dir in glob.glob(simulated_run_dirs_glob):
+        run_id = os.path.basename(run_dir)
+        qc_check_complete_file = os.path.join(run_dir, 'qc_check_complete.json')
+        with open(qc_check_complete_file, 'r') as f:
+            qc_check = json.load(f)
+        qc_status_by_run_id[run_id] = qc_check['overall_pass_fail']
+
+    return qc_status_by_run_id
+
+
+def parse_symlink_seqs_output_csv(symlink_seqs_output_csv):
+    """
+    Parse a symlink-seqs output CSV and annotate each row with its run ID.
+
+    The run ID is extracted from the ``R1`` column by searching for a NextSeq
+    or MiSeq run ID pattern. A NextSeq match takes precedence over a MiSeq
+    match. Rows where neither pattern matches get a ``RUN_ID`` of ``None``.
+
+    :param symlink_seqs_output_csv: Path to the symlink-seqs output CSV
+    :type symlink_seqs_output_csv: str
+    :return: One dict per CSV row, with an added ``RUN_ID`` key
+    :rtype: list[dict[str, str|None]]
+    :raises KeyError: If the CSV has no ``R1`` column
+    """
+    # Raw strings: '\d' in a normal string literal is an invalid escape
+    # sequence, which newer Python versions warn about.
+    miseq_regex = re.compile(r'\d{6}_M\d{5}_\d{4}_\d{9}-[A-Z0-9]{5}')
+    nextseq_regex = re.compile(r'\d{6}_VH\d{5}_\d+_[A-Z0-9]{9}')
+
+    symlink_seqs_output = []
+    # newline='' is what the csv module expects for files it reads.
+    with open(symlink_seqs_output_csv, 'r', newline='') as f:
+        for row in csv.DictReader(f):
+            match = nextseq_regex.search(row['R1']) or miseq_regex.search(row['R1'])
+            row['RUN_ID'] = match.group(0) if match else None
+            symlink_seqs_output.append(row)
+
+    return symlink_seqs_output
+
+
+def check_no_qc_failed_runs_are_symlinked(symlink_seqs_output, qc_status_by_run_id):
+    """
+    Check that no QC failed runs are symlinked.
+
+    A library counts as acceptable only if its run has a recorded QC status of
+    ``PASS``. Libraries whose ``RUN_ID`` is ``None`` or missing from
+    ``qc_status_by_run_id`` cannot be shown to have passed, so they make the
+    check fail rather than raising ``KeyError``. An empty
+    ``symlink_seqs_output`` passes.
+
+    :param symlink_seqs_output: Symlink-seqs output
+    :type symlink_seqs_output: list[dict[str, str]]
+    :param qc_status_by_run_id: QC status by run ID
+    :type qc_status_by_run_id: dict[str, str]
+    :return: Whether or not no QC failed runs are symlinked (True if no QC failed runs are symlinked, False otherwise)
+    :rtype: bool
+    """
+    return all(
+        qc_status_by_run_id.get(library['RUN_ID']) == 'PASS'
+        for library in symlink_seqs_output
+    )
+
+
+def main(args):
+    """
+    Run the output checks, write the results as CSV and exit non-zero on failure.
+
+    The results file has one row per test with the columns ``test_name`` and
+    ``test_result`` (``PASS`` or ``FAIL``).
+
+    :param args: Parsed arguments providing ``simulated_runs_dir``, ``symlink_seqs_output_csv`` and ``output``
+    :type args: argparse.Namespace
+    :return: None
+    :raises SystemExit: With exit code 1 if any test failed
+    """
+    qc_status_by_run_id = collect_qc_status_by_run_id(args.simulated_runs_dir)
+
+    symlink_seqs_output = parse_symlink_seqs_output_csv(args.symlink_seqs_output_csv)
+
+    no_qc_failed_runs_are_symlinked = check_no_qc_failed_runs_are_symlinked(symlink_seqs_output, qc_status_by_run_id)
+
+    tests = [
+        {
+            'test_name': 'no_qc_failed_runs_are_symlinked',
+            'test_passed': no_qc_failed_runs_are_symlinked,
+        },
+    ]
+    for test in tests:
+        test['test_result'] = "PASS" if test['test_passed'] else "FAIL"
+
+    output_fields = [
+        "test_name",
+        "test_result"
+    ]
+
+    # newline='' is what the csv module expects for files it writes.
+    with open(args.output, 'w', newline='') as f:
+        # extrasaction='ignore' drops the internal 'test_passed' key from the output.
+        writer = csv.DictWriter(f, fieldnames=output_fields, extrasaction='ignore')
+        writer.writeheader()
+        writer.writerows(tests)
+
+    # The results file is written first, so it is available even when a test fails.
+    # SystemExit is raised directly because the exit() builtin is only
+    # installed by the site module.
+    if not all(test['test_passed'] for test in tests):
+        raise SystemExit(1)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Check symlink-seqs outputs against the QC status of the simulated runs.')
+    parser.add_argument('--simulated-runs-dir', default='artifacts/simulated_runs', help='Directory containing simulated runs')
+    parser.add_argument('--symlink-seqs-output-csv', default='artifacts/symlink-seqs/mysterious_experiment.csv', help='Path to symlink-seqs output CSV')
+    # A default avoids open(None) in main() when -o is omitted.
+    parser.add_argument('-o', '--output', type=str, default='artifacts/check_outputs_results.csv', help='Path to the output file')
+    args = parser.parse_args()
+    main(args)
diff --git a/.github/scripts/check_outputs.sh b/.github/scripts/check_outputs.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+# Run the output checks and write the results table to artifacts/check_outputs_results.csv.
+# Paths are relative, so this presumably expects to be run from the repository root.
+.github/scripts/check_outputs.py -o artifacts/check_outputs_results.csv
diff --git a/.github/scripts/copy_samplesheets.py b/.github/scripts/copy_samplesheets.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python3
+
+# This script is used to temporarily fix an issue with the illumina-run-simulator
+# where the samplesheet is not being written to the demultiplexed output directory.
+
+import argparse
+import glob
+import json
+import os
+import shutil
+
+
+def get_instrument_type(run_id):
+    """
+    Get the instrument type ("miseq" or "nextseq"), based on the run ID.
+
+    The instrument ID is the second underscore-separated field of the run ID.
+    IDs starting with ``M`` are MiSeq and IDs starting with ``V`` are NextSeq.
+
+    :param run_id: Run ID
+    :type run_id: str
+    :return: Instrument type, or None if it cannot be determined
+    :rtype: str|None
+    """
+    fields = run_id.split('_')
+    if len(fields) < 2:
+        # Not shaped like a run ID (no instrument field); avoid an IndexError.
+        return None
+
+    instrument_id = fields[1]
+    if instrument_id.startswith('M'):
+        return 'miseq'
+    if instrument_id.startswith('V'):
+        return 'nextseq'
+
+    return None
+
+
+def _latest_match(pattern):
+    """
+    Return the last path, in sorted order, that matches a glob pattern.
+
+    :param pattern: Glob pattern
+    :type pattern: str
+    :return: Last matching path in sorted order, or None if nothing matches
+    :rtype: str|None
+    """
+    return max(glob.glob(pattern), default=None)
+
+
+def get_demultiplexing_outdir(run):
+    """
+    Find the appropriate subdirectory to copy the samplesheet to.
+
+    For MiSeq runs that contain a ``Data`` entry, the run directory itself is
+    used. Otherwise the last ``Alignment_*/*`` match (in sorted order) is used.
+    For NextSeq runs the last ``Analysis/*/Data`` match is used.
+
+    :param run: Run directory
+    :type run: str
+    :return: Demultiplexing output directory, or None if the instrument type
+             is unknown or no matching directory exists
+    :rtype: str|None
+    """
+    instrument_type = get_instrument_type(os.path.basename(run))
+    if instrument_type == 'miseq':
+        if os.path.exists(os.path.join(run, 'Data')):
+            return run
+        return _latest_match(os.path.join(run, 'Alignment_*', '*'))
+    if instrument_type == 'nextseq':
+        return _latest_match(os.path.join(run, 'Analysis', '*', 'Data'))
+
+    return None
+
+
+def main(args):
+    """
+    Copy each simulated run's top-level SampleSheet.csv into its demultiplexing output directory.
+
+    Runs are skipped when the instrument type is unknown, when no
+    demultiplexing output directory is found, when the source samplesheet
+    does not exist, or when source and destination are the same file.
+
+    :param args: Parsed arguments providing ``simulated_runs_dir``
+    :type args: argparse.Namespace
+    :return: None
+    """
+    simulated_runs_glob = os.path.join(args.simulated_runs_dir, '*', '*', '*')
+    for run in glob.glob(simulated_runs_glob):
+        if get_instrument_type(os.path.basename(run)) is None:
+            continue
+
+        demultiplexing_outdir = get_demultiplexing_outdir(run)
+        if demultiplexing_outdir is None:
+            # No demultiplexing output found for this run; nothing to copy into.
+            continue
+
+        samplesheet_path_src = os.path.join(run, 'SampleSheet.csv')
+        samplesheet_path_dst = os.path.join(demultiplexing_outdir, 'SampleSheet.csv')
+        try:
+            shutil.copy(samplesheet_path_src, samplesheet_path_dst)
+        except (FileNotFoundError, shutil.SameFileError):
+            # Best effort: a missing samplesheet or an in-place copy is not an error here.
+            continue
+
+
+if __name__ == '__main__':
+    # Parse the command-line options and hand them to main().
+    cli = argparse.ArgumentParser(description='')
+    cli.add_argument(
+        '--simulated-runs-dir',
+        default='artifacts/simulated_runs',
+        help='Directory containing simulated runs',
+    )
+    args = cli.parse_args()
+    main(args)
diff --git a/.github/scripts/download_run_simulator.sh b/.github/scripts/download_run_simulator.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+# Download the illumina-run-simulator v0.1.0 standalone jar and save it in the
+# current directory as illumina-run-simulator.jar.
+wget https://github.com/dfornika/illumina-run-simulator/releases/download/v0.1.0/illumina-run-simulator-0.1.0-standalone.jar -O illumina-run-simulator.jar
diff --git a/.github/scripts/run_symlink-seqs.sh b/.github/scripts/run_symlink-seqs.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+# Directory that receives the CSV captured from stdout below.
+mkdir -p artifacts/symlink-seqs
+
+# Run symlink-seqs for the 'mysterious_experiment' project with the test config
+# and redirect its stdout to artifacts/symlink-seqs/mysterious_experiment.csv.
+# NOTE(review): the -o directory (artifacts/symlinks/mysterious_experiment) is
+# not created here; presumably symlink-seqs creates it -- confirm.
+./symlink-seqs \
+    --config .github/data/symlink-seqs/config.json \
+    --project-id 'mysterious_experiment' \
+    --csv \
+    -o artifacts/symlinks/mysterious_experiment \
+    > artifacts/symlink-seqs/mysterious_experiment.csv
diff --git a/.github/scripts/simulate_runs.sh b/.github/scripts/simulate_runs.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+# Generate simulated sequencing runs, then add a QC result file and a copy of
+# the samplesheet to each run.
+
+# Stop at the first failing command so that a failure in one of the helper
+# scripts below is not masked by the exit status of a later command.
+set -e
+
+mkdir -p artifacts/simulated_runs/{M00123,M00456,VH00123}/22
+
+# The simulator is stopped by `timeout` after 10 seconds, so exit status 124
+# (timed out) is treated as success. `code` is initialised explicitly so the
+# check below never depends on a value inherited from the environment.
+code=0
+timeout 10 java -jar illumina-run-simulator.jar \
+	--config .github/data/illumina-run-simulator/config.edn \
+	2> >(tee artifacts/illumina-run-simulator.log.jsonl) \
+    || code=$?
+
+if [[ $code -ne 124 && $code -ne 0 ]]; then
+    exit "$code"
+fi
+
+.github/scripts/add_qc_check_complete.py --simulated-runs-dir artifacts/simulated_runs
+
+.github/scripts/copy_samplesheets.py --simulated-runs-dir artifacts/simulated_runs
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -0,0 +1,44 @@
+name: "Tests"
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+  workflow_dispatch:
+jobs:
+  test:
+    strategy:
+      fail-fast: false
+      matrix:
+        python_version: ["3.9", "3.10", "3.11", "3.12"]
+    name: Run tests
+    runs-on: ubuntu-latest
+    steps:
+    - name: Checkout Repo
+      uses: actions/checkout@v4
+    - name: Create artifacts directory
+      run: |
+        mkdir -p artifacts/artifacts
+    - name: Setup Java
+      uses: actions/setup-java@v3
+      with:
+        distribution: 'temurin'
+        java-version: '21'
+    # Python is set up before the simulation step because simulate_runs.sh
+    # executes Python helper scripts; this way they run under the matrix version.
+    - name: Setup Python
+      uses: actions/setup-python@v4
+      with:
+        python-version: ${{ matrix.python_version }}
+    - name: Download Run Simulator
+      run: |
+        bash .github/scripts/download_run_simulator.sh
+    - name: Simulate Runs
+      run: |
+        bash .github/scripts/simulate_runs.sh
+    - name: Run symlink-seqs
+      run: |
+        bash .github/scripts/run_symlink-seqs.sh
+    - name: Check Outputs
+      run: |
+        bash .github/scripts/check_outputs.sh
diff --git a/README.md b/README.md
@@ -1,4 +1,7 @@
+[![Tests](https://github.com/BCCDC-PHL/symlink-seqs/actions/workflows/tests.yml/badge.svg)](https://github.com/BCCDC-PHL/symlink-seqs/actions/workflows/tests.yml)
+
 # symlink-seqs
+
 Create fastq symlinks for selected samples in sequencer output directories based on project ID from SampleSheet files.
 
 ## Usage