diff --git a/CHANGES.md b/CHANGES.md index f2af464..ac2660b 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,9 @@ # CHANGES +## 5.3.0 + +* Support for paired FASTQ names ending with `_R1_001` and `_R2_001` in addition to `_1` and `_2` as before. + ## 5.2.2 * Delete files that may remain after an aborted run before resume. diff --git a/Dockerfile b/Dockerfile index b93fd18..b28ee09 100644 --- a/Dockerfile +++ b/Dockerfile @@ -62,7 +62,7 @@ FROM ubuntu:20.04 LABEL maintainer="cgphelp@sanger.ac.uk"\ uk.ac.sanger.cgp="Cancer, Ageing and Somatic Mutation, Wellcome Sanger Institute" \ - version="5.2.2" \ + version="5.3.0" \ description="pcap-core" ENV OPT /opt/wtsi-cgp diff --git a/bin/bwa_aln.pl b/bin/bwa_aln.pl index de433d3..4f4cfc4 100755 --- a/bin/bwa_aln.pl +++ b/bin/bwa_aln.pl @@ -229,7 +229,7 @@ =head2 INPUT FILE TYPES =item f[ast]q A standard uncompressed fastq file. Requires a pair of inputs with standard suffix of '_1' and '_2' -immediately prior to '.f[ast]q'. +or 'R1_001' and 'R2_001' immediately prior to '.f[ast]q'. =item f[ast]q.gz diff --git a/bin/bwa_mem.pl b/bin/bwa_mem.pl index 0bab214..eb7b76c 100755 --- a/bin/bwa_mem.pl +++ b/bin/bwa_mem.pl @@ -462,8 +462,8 @@ =head2 INPUT FILE TYPES =item f[ast]q A standard uncompressed fastq file. Requires a pair of inputs with standard suffix of '_1' and '_2' -immediately prior to '.f[ast]q' or an interleaved f[ast]q file where read 1 and 2 are adjacent -in the file. +or 'R1_001' and 'R2_001' immediately prior to '.f[ast]q' or an interleaved f[ast]q file where read 1 +and 2 are adjacent in the file. =item f[ast]q.gz diff --git a/lib/PCAP.pm b/lib/PCAP.pm index bac4a3e..d53ebb0 100644 --- a/lib/PCAP.pm +++ b/lib/PCAP.pm @@ -28,7 +28,7 @@ use FindBin qw($Bin); use File::Which qw(which); # don't use autodie, only core perl in here -our $VERSION = '5.2.2'; +our $VERSION = '5.3.0'; our @EXPORT = qw($VERSION _which); const my $LICENSE => diff --git a/lib/PCAP/Bwa.pm b/lib/PCAP/Bwa.pm index b275661..14dbf44 100644 --- a/lib/PCAP/Bwa.pm +++ b/lib/PCAP/Bwa.pm @@ -175,8 +175,8 @@ sub split_in { if($input->fastq) { # paired fq input if($input->paired_fq) { - my $fq1 = $input->in.'_1.'.$input->fastq; - my $fq2 = $input->in.'_2.'.$input->fastq; + my $fq1 = $input->in.($input->illumina_fq ? '_R1_001.' : '_1.').$input->fastq; + my $fq2 = $input->in.($input->illumina_fq ? '_R2_001.' : '_2.').$input->fastq; if($input->fastq =~ m/[.]gz$/ || $fragment_size > 5000) { symlink $fq1, File::Spec->catfile($split_folder, 'pairedfq1.0.'.$input->fastq); symlink $fq2, File::Spec->catfile($split_folder, 'pairedfq2.0.'.$input->fastq); @@ -415,6 +415,7 @@ sub sampe { my $pathstub = $input_meta->[$index-1]->tstub; my $fastq = $input_meta->[$index-1]->fastq; + my $illumina_fq = $input_meta->[$index-1]->illumina_fq; my $command = _which('bwa') || die "Unable to find 'bwa' in path";; @@ -425,10 +426,10 @@ sub sampe { if(defined $fastq) { $command .= sprintf $ALN_TO_SORTED, $input_meta->[$index-1]->rg_header(q{\t}), $ref, - $pathstub, - $pathstub, - $input_meta->[$index-1]->in.'_1', $fastq, - $input_meta->[$index-1]->in.'_2', $fastq, + $pathstub.($illumina_fq ? '_R1_001' : '_1'), + $pathstub.($illumina_fq ? '_R2_001' : '_2'), + $input_meta->[$index-1]->in.($illumina_fq ? '_R1_001' : '_1'), $fastq, + $input_meta->[$index-1]->in.($illumina_fq ? '_R2_001' : '_2'), $fastq, $bamsort, $pathstub, $pathstub; @@ -436,8 +437,8 @@ sub sampe { else { $command .= sprintf $ALN_TO_SORTED, $input_meta->[$index-1]->rg_header(q{\t}), $ref, - $pathstub, - $pathstub, + $pathstub.'_1', + $pathstub.'_2', $pathstub, 'bam', $pathstub, 'bam', $bamsort, diff --git a/lib/PCAP/Bwa/Meta.pm b/lib/PCAP/Bwa/Meta.pm index 61fc16d..8257567 100644 --- a/lib/PCAP/Bwa/Meta.pm +++ b/lib/PCAP/Bwa/Meta.pm @@ -37,7 +37,7 @@ use YAML qw(LoadFile); use PCAP::Bam; -const my @INIT_KEYS => qw(in temp fastq paired_fq cram bam); +const my @INIT_KEYS => qw(in temp fastq paired_fq illumina_fq cram bam); const my @REQUIRED_KEYS => qw(in temp); const my @REQUIRED_RG_ELEMENTS => qw(SM); const my @ALLOWED_RG_ELEMENTS => qw(ID CN DS DT FO KS LB PG PI PL PM PU SM); @@ -98,6 +98,12 @@ sub paired_fq { return $self->{'paired_fq'}; # this will create a key but not worth adding overhead to prevent } +sub illumina_fq { + my $self = shift; + croak "'illumina_fq' can only be set via new()" if(scalar @_ > 0); + return $self->{'illumina_fq'}; +} + sub bam_or_cram { my $self = shift; if(exists $self->{'bam'} && defined $self->{'bam'} && $self->{'bam'} == 1) { @@ -271,10 +277,11 @@ sub files_to_meta { # must be paired fq next if(exists $seen_paired_stub{$fq_stub}); $seen_paired_stub{$fq_stub} = 1; - die "Unable to find file for read 1, for ${fq_stub}_X.${fq_ext}\n" unless(-e "${fq_stub}_1.$fq_ext"); - die "Unable to find file for read 2, for ${fq_stub}_X.${fq_ext}\n" unless(-e "${fq_stub}_2.$fq_ext"); - die "File for read 1 is empty: ${fq_stub}_X.${fq_ext}\n" unless(-s "${fq_stub}_1.$fq_ext"); - die "File for read 2 is empty: ${fq_stub}_X.${fq_ext}\n" unless(-s "${fq_stub}_2.$fq_ext"); + die "Unable to find file for read 1, for ${fq_stub}_X.${fq_ext}\n" unless(-e "${fq_stub}_1.$fq_ext" || -e "${fq_stub}_R1_001.$fq_ext"); + die "Unable to find file for read 2, for ${fq_stub}_X.${fq_ext}\n" unless(-e "${fq_stub}_2.$fq_ext" || -e "${fq_stub}_R2_001.$fq_ext"); + die "File for read 1 is empty: ${fq_stub}_X.${fq_ext}\n" unless(-s "${fq_stub}_1.$fq_ext" || -s "${fq_stub}_R1_001.$fq_ext"); + die "File for read 2 is empty: ${fq_stub}_X.${fq_ext}\n" unless(-s "${fq_stub}_2.$fq_ext" || -s "${fq_stub}_R2_001.$fq_ext"); + $meta->{'illumina_fq'} = ( $end=~m/R[12]_001/ ? 1 : 0 ); $meta->{'paired_fq'} = 1; $are_paired_fq = 1; } @@ -343,7 +350,7 @@ sub is_fastq_ext { sub parse_fastq_filename { my $fastq = shift; # shouldn't have extension by now my $end; - if($fastq =~ s/_([12])$//) { + if($fastq =~ s/_([12]|R[12]_001)$//) { $end = $1; } return ($fastq, $end); @@ -419,9 +426,9 @@ Added for future compatibility with files that can be used with 'BWA mem'. When input is some form of fastq this returns the extension, otherwise undef. -=item paired_fq +=item illumina_fq -When input is a paired fastq file this returns true. +If paired FASTQ names end with _R[12]_001, this returns true. Returns false if they end with _[12]. =item rg @@ -488,6 +495,9 @@ Understands the following as fastq files: Takes a fastq filename after removal of the extension (see L and determines if file is read1, read2 or interleaved fastq. -Expects file to end '_1' or '_2' for paired, assumed interleaved otherwise. +Expects paired FASTQ names to end with one the following. Assumes interleaved FASTQ otherwise. + + _1 and _2 - PCAP's original naming requirement + _R1_001 and _R2_001 - Illumina's naming requirement; Require 3 digit suffix to be 001 to avoid split FASTQs =back diff --git a/t/data/4_R1_001.fq b/t/data/4_R1_001.fq new file mode 100644 index 0000000..54b00f5 --- /dev/null +++ b/t/data/4_R1_001.fq @@ -0,0 +1,4 @@ +@HS1_10761:8:1312:1589:61870#79/1 +CTAACCCTAACCCTAACCCTAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAACCCTAACCCTAACCCTAAC ++ +ECEFEGFFDBHHEFCHFGGGGHHHFFGGGGGGDHIDGFFFEGEFH;FGECGGHGDHHAEEEBH:FADEIEEFAED diff --git a/t/data/4_R2_001.fq b/t/data/4_R2_001.fq new file mode 100644 index 0000000..b06c448 --- /dev/null +++ b/t/data/4_R2_001.fq @@ -0,0 +1,4 @@ +@HS1_10761:8:1312:1589:61870#79/2 +GGGTTAGGGTTAGGGTTAAGGGTTAGGGGTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTTAG ++ +>:B>EFFFC7FCHGF5HDHGFH5G4HFC4GFGGF(FFDHB>BFHCF3EEGE?3EGHEE333@GH*CBG;=(ABDC diff --git a/t/pcapBwaMeta.t b/t/pcapBwaMeta.t index 69a19cb..33ea38e 100644 --- a/t/pcapBwaMeta.t +++ b/t/pcapBwaMeta.t @@ -73,6 +73,12 @@ subtest 'Non-object funcions' => sub { ($base, $end) = PCAP::Bwa::Meta::parse_fastq_filename('fastq_2'); is($base, 'fastq', "fastq_2 file should have '_2' removed"); is($end, '2', "Read end should be 2"); + ($base, $end) = PCAP::Bwa::Meta::parse_fastq_filename('fastq_R1_001'); + is($base, 'fastq', "fastq_R1_001 has a basename of 'fastq'"); + is($end, 'R1_001', "file ending with R1_001 indicates read 1 of pair"); + ($base, $end) = PCAP::Bwa::Meta::parse_fastq_filename('fastq_R2_001'); + is($base, 'fastq', "fastq_R2_001 has a basename of 'fastq'"); + is($end, 'R2_001', "file ending with R2_001 indicates read 2 of a pair"); ($base, $end) = PCAP::Bwa::Meta::parse_fastq_filename('fastq_3'); isnt($base, 'fastq', "fastq_3 file should be unchanged, assume interleaved"); ok(!defined $end, "_3, so not defined"); @@ -134,6 +140,9 @@ subtest 'Objects from file list' => sub { is(PCAP::Bwa::Meta::files_to_meta($tmp, [ File::Spec->catfile($test_data, '1_1.fq') , File::Spec->catfile($test_data, '1_2.fq')], 'sample')->[0]->{'paired_fq'} , 1, 'Paired fq identified as paired'); + is(PCAP::Bwa::Meta::files_to_meta($tmp, [ File::Spec->catfile($test_data, '4_R1_001.fq') + , File::Spec->catfile($test_data, '4_R2_001.fq')], 'sample')->[0]->{'paired_fq'} + , 1, 'Paired fastqs with Illumina names identified as paired'); like( exception{ PCAP::Bwa::Meta::files_to_meta($tmp , [ File::Spec->catfile($test_data, '2_1.fq') , File::Spec->catfile($test_data, '2_2.fq')] @@ -173,6 +182,13 @@ subtest 'Objects from file list' => sub { , 'sample') } , qq{ERROR: BAM|CRAM, paired FASTQ and interleaved FASTQ file types cannot be mixed, please choose one type\n} , 'Fail when inputs are mixed file types'); + is( exception{ PCAP::Bwa::Meta::files_to_meta($tmp + , [ File::Spec->catfile($test_data, '4_R1_001.fq') + , File::Spec->catfile($test_data, '4_R2_001.fq') + , File::Spec->catfile($test_data, '1.fq')] + , 'sample') } + , qq{ERROR: BAM|CRAM, paired FASTQ and interleaved FASTQ file types cannot be mixed, please choose one type\n} + , 'Fail when inputs mix interleaves with paired per illumina'); like( exception{ PCAP::Bwa::Meta::files_to_meta($tmp, [ File::Spec->catfile($test_data, 'wibble.wobble')] , 'sample') } , qr/.+ is not an expected input file type.\n/m @@ -248,6 +264,11 @@ subtest 'Accessors' => sub { , qr/'paired_fq' can only be set via new\(\)/ , 'Fail to directly set paired_fq'); + is($meta->illumina_fq, 0, 'Expected non illumina fastq names'); + like( exception { $meta->illumina_fq(1) } + , qr/'illumina_fq' can only be set via new\(\)/ + , 'Fail to directly set illumina_fq'); + my $tstub = File::Spec->catfile($tmp, '2'); # 2 as files_to_meta called twice is($meta->tstub, $tstub, 'Generate expected tstub');