Skip to content

Commit

Permalink
Handle empty files (#5720)
Browse files Browse the repository at this point in the history
* fix: emit all fastqs, remove global log file

* fix: remove rg parsing

* fix(subworkflow/bcl_demultiplex): remove rg parsing, output empty fastqs, remove log file

* test: update snaps

* lint: trailing whitespace

* feat: include read group parsing and empty file check in single closure

* test: update snapshot

* fix: lint final new line

* fix: lint trailing whitespace

* fix: add branch and emit empty fastq channel

* tests: update snaps

* lint: trailing whitespace

* fix: filter bool and add test

* test: update snaps

* test: assert empty file exist

* test: assert empty file exist

* test: check file exists, update snap

* Update test.yml

* Update test.yml

* ci: debug nf-test failure

* ci: debug nf-test failure

* ci: debug nf-test failure

* ci: debug nf-test failure

* ci: debug nf-test failure

* ci: debug nf-test failure

* ci: debug nf-test failure

* ci: debug nf-test failure

* ci: debug nf-test failure

* ci: debug nf-test failure

* ci: debug nf-test failure

* ci: debug nf-test failure

* ci: debug nf-test failure

* ci: debug nf-test failure

* ci: debug nf-test failure

* ci: debug nf-test failure

* ci: debug nf-test failure

* ci: debug nf-test failure

* ci: debug nf-test failure

* ci: debug nf-test failure

* ci: debug nf-test failure

* lint: fix lint

---------

Co-authored-by: khazen@clearnotehealth.com <--unset>
Co-authored-by: Simon Pearce <24893913+SPPearce@users.noreply.github.com>
  • Loading branch information
k1sauce and SPPearce authored Jul 15, 2024
1 parent 5c907c4 commit cc87c4d
Show file tree
Hide file tree
Showing 5 changed files with 169 additions and 144 deletions.
168 changes: 65 additions & 103 deletions subworkflows/nf-core/bcl_demultiplex/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,9 @@
include { BCLCONVERT } from "../../../modules/nf-core/bclconvert/main"
include { BCL2FASTQ } from "../../../modules/nf-core/bcl2fastq/main"

// Define the log file path before the workflow starts
def logFile = new File("${params.outdir}/invalid_fastqs.log")

workflow BCL_DEMULTIPLEX {
take:
ch_flowcell // [[id:"", lane:""],samplesheet.csv, path/to/bcl/files]
ch_flowcell // [[id:"", lane:""], samplesheet.csv, path/to/bcl/files]
demultiplexer // bclconvert or bcl2fastq

main:
Expand Down Expand Up @@ -67,106 +64,71 @@ workflow BCL_DEMULTIPLEX {
}

// Generate meta for each fastq
ch_fastq_with_meta = generate_fastq_meta(ch_fastq, logFile)

emit:
fastq = ch_fastq_with_meta
reports = ch_reports
stats = ch_stats
interop = ch_interop
versions = ch_versions
}

/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
FUNCTIONS
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/

// Appends `text`, followed by a newline, to `logFile`.
// The log file and any missing parent directories are created on first use, so the
// call also succeeds when the directory that should hold the log does not exist yet.
//
// text    - the message to record (one line is written per call)
// logFile - the file to append to
// Returns the same File, as the `<<` operator does.
def appendToLogFile(String text, File logFile) {
    // Creating or appending to a file fails when its parent directory is missing,
    // so make sure the directory tree exists before writing.
    def parentDir = logFile.getAbsoluteFile().getParentFile()
    if (parentDir != null && !parentDir.exists()) {
        parentDir.mkdirs()
    }
    // `<<` on a File appends and creates the file when it is absent, so no separate
    // exists()/createNewFile() step is needed.
    logFile << text.toString() + "\n"
}

// Add meta values to fastq channel and skip invalid FASTQ files
def generate_fastq_meta(ch_reads, logFile) {
// Create a tuple with the meta.id and the fastq
ch_reads.transpose().map { fc_meta, fastq ->
// Check if the FASTQ file is empty or has invalid content
def isValid = fastq.withInputStream { is ->
new java.util.zip.GZIPInputStream(is).withReader('ASCII') { reader ->
def line = reader.readLine()
line != null && line.startsWith('@')
}
}

def meta = null
if (isValid) {
meta = [
"id": fastq.getSimpleName().toString() - ~/_R[0-9]_001.*$/,
"samplename": fastq.getSimpleName().toString() - ~/_S[0-9]+.*$/,
"readgroup": [:],
"fcid": fc_meta.id,
"lane": fc_meta.lane
]
meta.readgroup = readgroup_from_fastq(fastq)
meta.readgroup.SM = meta.samplename
} else {
appendToLogFile(
"Empty or invalid FASTQ file: ${fastq}",
logFile
)
fastq = null
ch_fastq_with_meta = ch_fastq
// reshapes the channel from a single emit of [meta, [fastq, fastq, fastq...]]
// to emits per fastq file like [meta, fastq]
.transpose()
.map { fc_meta, fastq ->
def meta = [:]
meta.id = fastq.getSimpleName().toString() - ~/_R[0-9]_001.*$/
meta.samplename = fastq.getSimpleName().toString() - ~/_S[0-9]+.*$/
meta.fcid = fc_meta.id
meta.lane = fc_meta.lane
// The buffered input stream allows reading directly from cloud storage
// It will not make a local copy of the file.
fastq.withInputStream {
InputStream gzipStream = new java.util.zip.GZIPInputStream(it)
Reader decoder = new InputStreamReader(gzipStream, 'ASCII')
BufferedReader buffered = new BufferedReader(decoder)
line = buffered.readLine()
buffered.close()
}

return [meta, fastq]
}.filter { it[0] != null }
// Group by meta.id for PE samples
.groupTuple(by: [0])
// Add meta.single_end
.map { meta, fastq ->
if (meta != null) {
meta.single_end = fastq.size() == 1
if ( line != null && line.startsWith('@') ) {
line = line.substring(1)
// expected format is like:
// xx:yy:FLOWCELLID:LANE:... (seven fields)
fields = line.split(':')
// CASAVA 1.8+ format, from https://support.illumina.com/help/BaseSpace_OLH_009008/Content/Source/Informatics/BS/FileFormat_FASTQ-files_swBS.htm
// "@<instrument>:<run number>:<flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos>:<UMI> <read>:<is filtered>:<control number>:<index>"
sequencer_serial = fields[0]
run_nubmer = fields[1]
fcid = fields[2]
lane = fields[3]
index = fields[-1] =~ /[GATC+-]/ ? fields[-1] : ""
ID = [fcid, lane].join(".")
PU = [fcid, lane, index].findAll().join(".")
PL = "ILLUMINA"
SM = fastq.getSimpleName().toString() - ~/_S[0-9]+.*$/
meta.readgroup = [
"ID": ID,
"SM": SM,
"PL": PL,
"PU": PU
]
meta.empty = false
} else {
println "No reads were found in FASTQ file: ${fastq}"
meta.readgroup = [:]
meta.empty = true
}
return [meta, fastq.flatten()]
}
}

// https://github.com/nf-core/sarek/blob/7ba61bde8e4f3b1932118993c766ed33b5da465e/workflows/sarek.nf#L1014-L1040
def readgroup_from_fastq(path) {
// expected format:
// xx:yy:FLOWCELLID:LANE:... (seven fields)

def line

path.withInputStream {
InputStream gzipStream = new java.util.zip.GZIPInputStream(it)
Reader decoder = new InputStreamReader(gzipStream, 'ASCII')
BufferedReader buffered = new BufferedReader(decoder)
line = buffered.readLine()
}
assert line.startsWith('@')
line = line.substring(1)
def fields = line.split(':')
def rg = [:]

// CASAVA 1.8+ format, from https://support.illumina.com/help/BaseSpace_OLH_009008/Content/Source/Informatics/BS/FileFormat_FASTQ-files_swBS.htm
// "@<instrument>:<run number>:<flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos>:<UMI> <read>:<is filtered>:<control number>:<index>"
sequencer_serial = fields[0]
run_nubmer = fields[1]
fcid = fields[2]
lane = fields[3]
index = fields[-1] =~ /[GATC+-]/ ? fields[-1] : ""

rg.ID = [fcid,lane].join(".")
rg.PU = [fcid, lane, index].findAll().join(".")
rg.PL = "ILLUMINA"
return [meta, fastq]
}
// Group by the meta id so that we can find mate pairs if they exist
.groupTuple(by: [0])
.map { meta, fastq ->
meta.single_end = fastq.size() == 1
return [meta, fastq.flatten()]
}
.branch {
fastq : it[0].empty == false
empty_fastq : it[0].empty == true
}

return rg
emit:
fastq = ch_fastq_with_meta.fastq
empty_fastq = ch_fastq_with_meta.empty_fastq
reports = ch_reports
stats = ch_stats
interop = ch_interop
versions = ch_versions
}
13 changes: 7 additions & 6 deletions subworkflows/nf-core/bcl_demultiplex/tests/main.nf.test
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,10 @@ nextflow_workflow {
workflow {
"""
input[0] = Channel.value([
[id:'test', lane:1 ],
"https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/bcl/flowcell_samplesheet.csv",
"https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/bcl/flowcell.tar.gz"
])
[id:'HMTFYDRXX'],
"https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/sarscov2/illumina/bcl/SampleSheet.csv",
"https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/sarscov2/illumina/bcl/200624_A00834_0183_BHMTFYDRXX.tar.gz"
])
input[1] = "bclconvert"
"""
}
Expand All @@ -39,7 +39,8 @@ nextflow_workflow {
workflow.out.interop.get(0).get(1).findAll { file(it).name != "IndexMetricsOut.bin" },
).match()
},
{ assert file(workflow.out.interop.get(0).get(1).find { file(it).name == "IndexMetricsOut.bin" }).exists() }
{ assert file(workflow.out.interop.get(0).get(1).find { file(it).name == "IndexMetricsOut.bin" }).exists() },
{ assert file(workflow.out.empty_fastq.get(0).get(1).find { file(it).name == "SampleZ_S5_L001_R1_001.fastq.gz" }).exists() }
)
}
}
Expand All @@ -54,7 +55,7 @@ nextflow_workflow {
[id:'test', lane:1 ],
"https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/bcl/flowcell_samplesheet.csv",
"https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/bcl/flowcell.tar.gz"
])
])
input[1] = "bcl2fastq"
"""
}
Expand Down
127 changes: 95 additions & 32 deletions subworkflows/nf-core/bcl_demultiplex/tests/main.nf.test.snap
Original file line number Diff line number Diff line change
Expand Up @@ -4,22 +4,21 @@
[
[
{
"id": "test",
"lane": 1
"id": "HMTFYDRXX"
},
[
"Adapter_Cycle_Metrics.csv:md5,5a0c88793b4a0885fe3dda16609b576e",
"Adapter_Metrics.csv:md5,989240b8840b2169ac1061f952c90f6c",
"Demultiplex_Stats.csv:md5,93949a8cd96f907d83e0808c1ec2a04b",
"Demultiplex_Tile_Stats.csv:md5,83120160b0f22a1303fa1db31c19f6e9",
"IndexMetricsOut.bin:md5,9e688c58a5487b8eaf69c9e1005ad0bf",
"Index_Hopping_Counts.csv:md5,1059369e375fd8f8423c0f6c934be978",
"Quality_Metrics.csv:md5,6614accb1bb414fe312b17b81f5521f7",
"Quality_Tile_Metrics.csv:md5,cdc89fd2962bdd4a24f71e186112118a",
"RunInfo.xml:md5,03038959f4dd181c86bc97ae71fe270a",
"SampleSheet.csv:md5,dc0dffd39541dd6cc5b4801d768a8d2b",
"Top_Unknown_Barcodes.csv:md5,2e2faba761137f228e56bd3428453ccc",
"fastq_list.csv:md5,05bc84f51840f5754cfb8381b36f2cb0"
"Adapter_Cycle_Metrics.csv:md5,05fbe7b2b0acdd557d355b448aa88ace",
"Adapter_Metrics.csv:md5,0fa4ac708955417af9d18cec4955552f",
"Demultiplex_Stats.csv:md5,4a3f451faa098156623b55b0f2ff27ee",
"Demultiplex_Tile_Stats.csv:md5,8f6fb58990572c4aa19c0100d8351484",
"IndexMetricsOut.bin:md5,fb16c8a9873e5b5950ae5949126af76c",
"Index_Hopping_Counts.csv:md5,f59474d96afe8218c7590bb240b19690",
"Quality_Metrics.csv:md5,c4622066f85d93b1661c928a46cfc508",
"Quality_Tile_Metrics.csv:md5,e22bc5e2f147695150b02afcccb38c4f",
"RunInfo.xml:md5,f283cb4600235db9261ee1e319b1407e",
"SampleSheet.csv:md5,4113eabae23136cc819c7f15ac5b6aad",
"Top_Unknown_Barcodes.csv:md5,37dbc2860c640fc721820b0217ea0504",
"fastq_list.csv:md5,b2409de8a184e83554766cd4460240a4"
]
]
],
Expand All @@ -31,38 +30,101 @@
{
"id": "Sample1_S1_L001",
"samplename": "Sample1",
"fcid": "HMTFYDRXX",
"readgroup": {
"ID": "000000000-K9H97.1",
"PU": "000000000-K9H97.1",
"ID": "HMTFYDRXX.1",
"SM": "Sample1",
"PL": "ILLUMINA",
"SM": "Sample1"
"PU": "HMTFYDRXX.1.GAACTGAGCG+TCGTGGAGCG"
},
"fcid": "test",
"lane": 1,
"empty": false,
"single_end": true
},
[
"Sample1_S1_L001_R1_001.fastq.gz:md5,0675fb6365322eaafb33c0f8e862b54b"
"Sample1_S1_L001_R1_001.fastq.gz:md5,b5489d1964db8db5502eb742cc3ef3ec"
]
],
[
{
"id": "Sample23_S3_L001",
"samplename": "Sample23",
"fcid": "HMTFYDRXX",
"readgroup": {
"ID": "HMTFYDRXX.1",
"SM": "Sample23",
"PL": "ILLUMINA",
"PU": "HMTFYDRXX.1.CGTCTCATAT+TATAGTAGCT"
},
"empty": false,
"single_end": true
},
[
"Sample23_S3_L001_R1_001.fastq.gz:md5,767a1091320320b140288066e29bccc5"
]
],
[
{
"id": "SampleA_S2_L001",
"samplename": "SampleA",
"fcid": "HMTFYDRXX",
"readgroup": {
"ID": "HMTFYDRXX.1",
"SM": "SampleA",
"PL": "ILLUMINA",
"PU": "HMTFYDRXX.1.AGGTCAGATA+CTACAAGATA"
},
"empty": false,
"single_end": true
},
[
"SampleA_S2_L001_R1_001.fastq.gz:md5,7de2ea88133409f34563f40a0d8c9e55"
]
],
[
{
"id": "sampletest_S4_L001",
"samplename": "sampletest",
"fcid": "HMTFYDRXX",
"readgroup": {
"ID": "HMTFYDRXX.1",
"SM": "sampletest",
"PL": "ILLUMINA",
"PU": "HMTFYDRXX.1.ATTCCATAAG+TGCCTGGTGG"
},
"empty": false,
"single_end": true
},
[
"sampletest_S4_L001_R1_001.fastq.gz:md5,c16c7de1b7bffb5e4503f4d94c40f881"
]
]
],
[

],
[
"ControlMetricsOut.bin:md5,6d77b38d0793a6e1ce1e85706e488953",
"CorrectedIntMetricsOut.bin:md5,2bbf84d3be72734addaa2fe794711434",
"ErrorMetricsOut.bin:md5,38c88def138e9bb832539911affdb286",
"ExtractionMetricsOut.bin:md5,7497c3178837eea8f09350b5cd252e99",
"QMetricsOut.bin:md5,7e9f198d53ebdfbb699a5f94cf1ed51c",
"TileMetricsOut.bin:md5,83891751ec1c91a425a524b476b6ca3c"
"BasecallingMetricsOut.bin:md5,7fb651325cba614d497d376eaf43fef4",
"CorrectedIntMetricsOut.bin:md5,dc8d57282ba9ece9e5fc58a92aa2ac52",
"EmpiricalPhasingMetricsOut.bin:md5,1ef4631faf0a3a3beb31b10fc38a734d",
"EventMetricsOut.bin:md5,dee320ce29bdadde44589aa9439f53ab",
"ExtendedTileMetricsOut.bin:md5,f01d1a9cf8445adf719e652ad7304cf2",
"ExtractionMetricsOut.bin:md5,972f4082ad950baaf42a6d28517d28a8",
"FWHMGridMetricsOut.bin:md5,6e297bafcd845bfd0440d08e1bb27685",
"ImageMetricsOut.bin:md5,ac5d1f0a1f611c0c7c9dd8e6b9e701b1",
"OpticalModelMetricsOut.bin:md5,3eaea5fcf2d353950b1e720c73695ccb",
"PFGridMetricsOut.bin:md5,ae469858ee96ffafbcaf3afb814bdab2",
"QMetrics2030Out.bin:md5,438248760db58917b32f4eccc6c64c39",
"QMetricsByLaneOut.bin:md5,e8254cb4a27846710a2a143296be2d8f",
"QMetricsOut.bin:md5,8f6b83028a42be721200a598161ac5c6",
"RegistrationMetricsOut.bin:md5,b5ebd957aed067b6403d851ba2ce0139",
"TileMetricsOut.bin:md5,21388348d81fa9be326d30ef6d348464"
]
],
"meta": {
"nf-test": "0.8.4",
"nextflow": "24.04.2"
},
"timestamp": "2024-07-10T11:37:10.291289677"
"timestamp": "2024-06-26T20:28:00.234964"
},
"bcl2fastq": {
"content": [
Expand Down Expand Up @@ -123,14 +185,15 @@
{
"id": "Sample1_S1_L001",
"samplename": "Sample1",
"fcid": "test",
"lane": 1,
"readgroup": {
"ID": "000000000-K9H97.1",
"PU": "000000000-K9H97.1",
"SM": "Sample1",
"PL": "ILLUMINA",
"SM": "Sample1"
"PU": "000000000-K9H97.1"
},
"fcid": "test",
"lane": 1,
"empty": false,
"single_end": true
},
[
Expand Down Expand Up @@ -167,6 +230,6 @@
"nf-test": "0.8.4",
"nextflow": "23.10.1"
},
"timestamp": "2024-05-07T09:01:39.665409003"
"timestamp": "2024-06-26T20:28:19.854819"
}
}
Loading

0 comments on commit cc87c4d

Please sign in to comment.