diff --git a/CHANGES.md b/CHANGES.md index 5b61995..d3a7ab9 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,12 @@ # CHANGES +## 5.2.0 + +* Expose option for "legacy" to allow for <=5.0.5 processing methods. + * `bamtofastq` when pulling reads from BAM/CRAM input. + * `bammarkduplicates2` for duplicate marking. + * Affects `bwa_mem.pl` and `merge_or_mark.pl` + ## 5.1.0 * Base image updated to Focal (Ubuntu 20.04). diff --git a/Dockerfile b/Dockerfile index c112cfa..8265e02 100644 --- a/Dockerfile +++ b/Dockerfile @@ -62,7 +62,7 @@ FROM ubuntu:20.04 LABEL maintainer="cgphelp@sanger.ac.uk"\ uk.ac.sanger.cgp="Cancer, Ageing and Somatic Mutation, Wellcome Sanger Institute" \ - version="5.1.0" \ + version="5.2.0" \ description="pcap-core" ENV OPT /opt/wtsi-cgp diff --git a/bin/bwa_mem.pl b/bin/bwa_mem.pl index 6f7afe4..0bab214 100755 --- a/bin/bwa_mem.pl +++ b/bin/bwa_mem.pl @@ -136,6 +136,7 @@ sub setup { 'qf|mmqcfrac:f' => \$opts{'mmqcfrac'}, 'bm2|bwamem2' => \$opts{'bwamem2'}, 'd|dupmode:s' => \$opts{'dupmode'}, + 'legacy' => \$opts{'legacy'}, 'ss|seqslice:i' => $opts{'seqslice'}, ) or pod2usage(2); @@ -178,6 +179,11 @@ sub setup { delete $opts{'mmqc'} unless(defined $opts{'mmqc'}); delete $opts{'csi'} unless(defined $opts{'csi'}); delete $opts{'bwamem2'} unless(defined $opts{'bwamem2'}); + delete $opts{'legacy'} unless(defined $opts{'legacy'}); + + if(defined $opts{'bwamem2'} && defined $opts{'legacy'}) { + warn "WARN: Use of options bwamem2 and legacy is suboptimal, proceeding but memory will be excessive.\n"; + } PCAP::Cli::opt_requires_opts('scramble', \%opts, ['cram']); @@ -243,7 +249,7 @@ =head1 SYNOPSIS -threads -t Number of threads to use. [1] Optional parameters: - -bwamem2 -bm2 Use bwa-mem2 instead of bwa. + -bwamem2 -bm2 Use bwa-mem2 instead of bwa (experimental). -fragment -f Split input into fragments of X million repairs [10] - only applies to fastq[.gz] input -nomarkdup -n Don't mark duplicates [flag] @@ -261,6 +267,10 @@ =head1 SYNOPSIS - Please see 'bwa_mem.pl -m' -mmqcfrac -qf Mismatch fraction for -mmqc [0.05] -dupmode -d see "samtools markdup -m" [t] + -legacy Equivalent to PCAP-core<=5.0.5 + - bamtofastq instead of samtools collate (for BAM/CRAM input) + - dupmode ignored as uses bammarkduplicates2 + - Avoid use with bwamem2 (memory explosion) Targeted processing: -process -p Only process this step then exit, optionally set -index @@ -360,6 +370,16 @@ =head2 OPTIONAL parameters Disables duplicate marking, switching bammarkduplicates2 for bammerge. +=item B<-dupmode> + +Switch between template and sequence based marking. See "samtools markdup" man page for more details + +=item B<-legacy> + +Processing equivalent to versions of PCAP-core <= 5.0.5 (bamtofastq + bammarkduplicates2) + +Not recommended with bwamem2 - memory explosions. + =item B<-csi> User CSI style index for final BAM file instead of default BAI. diff --git a/bin/merge_or_mark.pl b/bin/merge_or_mark.pl index a87c48f..f5b0cc2 100755 --- a/bin/merge_or_mark.pl +++ b/bin/merge_or_mark.pl @@ -90,6 +90,7 @@ sub setup { 'c|cram' => \$opts{'cram'}, 'sc|scramble=s' => \$opts{'scramble'}, 'd|dupmode:s' => \$opts{'dupmode'}, + 'legacy' => \$opts{'legacy'}, 'ss|seqslice:i' => $opts{'seqslice'}, ) or pod2usage(2); @@ -121,6 +122,7 @@ sub setup { delete $opts{'process'} unless(defined $opts{'process'}); delete $opts{'index'} unless(defined $opts{'index'}); + delete $opts{'legacy'} unless(defined $opts{'legacy'}); delete $opts{'scramble'}; delete $opts{'csi'} unless(defined $opts{'csi'}); if($opts{'qnamesort'} && !$opts{'nomarkdup'}){ @@ -183,7 +185,8 @@ =head1 SYNOPSIS -cram -c Output cram, see '-sc' [flag] -seqslice -ss seqs_per_slice for CRAM compression [samtools default: 10000] -scramble -sc DEPRECATED - -dupmode -d see "samtools markdup -m" [t] + -dupmode -d See "samtools markdup -m" [t] + -legacy Use legacy bammarkduplicates2, ignores '-dupmode' Targeted processing: -process -p Only process this step then exit diff --git a/lib/PCAP.pm b/lib/PCAP.pm index d75980c..817557c 100644 --- a/lib/PCAP.pm +++ b/lib/PCAP.pm @@ -28,7 +28,7 @@ use FindBin qw($Bin); use File::Which qw(which); # don't use autodie, only core perl in here -our $VERSION = '5.1.0'; +our $VERSION = '5.2.0'; our @EXPORT = qw($VERSION _which); const my $LICENSE => diff --git a/lib/PCAP/Bam.pm b/lib/PCAP/Bam.pm index 19ba85b..4cce3f1 100644 --- a/lib/PCAP/Bam.pm +++ b/lib/PCAP/Bam.pm @@ -154,8 +154,21 @@ sub merge_or_mark_lanes { else { my $merge = sprintf q{%s merge -u -@ %d - %s}, $tools{samtools}, $helper_threads, $input_str; - my $markdup = sprintf q{%s markdup --mode %s --output-fmt bam,level=0 -S --include-fails -T %s -@ %d -f %s.met - -}, + my $markdup; + if(exists $options->{legacy}) { + my $mmflagmod = _which('mmFlagModifier') || die "Unable to find 'mmFlagModifier' in path"; + my $bammarkdups = _which('bammarkduplicates2') || die "Unable to find 'bammarkduplicates2' in path"; + + my $mmQcRemove = sprintf '%s --remove -l 0 -@ %d', $mmflagmod, $helper_threads; + my $bammarkdup = sprintf '%s tmpfile=%s M=%s.met level=0 markthreads=%d', $bammarkdups, $strmd_tmp, $marked, $helper_threads; + my $mmQcReplace = sprintf '%s --replace -l 0 -@ %d', $mmflagmod, $helper_threads; + + $markdup = sprintf q{%s | %s | %s}, $mmQcRemove, $bammarkdup, $mmQcReplace; + } + else { + $markdup = sprintf q{%s markdup --mode %s --output-fmt bam,level=0 -S --include-fails -T %s -@ %d -f %s.met - -}, $tools{samtools}, $options->{dupmode}, $strmd_tmp, $helper_threads, $marked; + } my $compress = sprintf q{%s view -T %s --output-fmt %s -@ %d -}, $tools{samtools}, $options->{reference}, $out_fmt, $helper_threads; my $idx = sprintf q{%s index -@ %d %s - %s.%s}, @@ -254,8 +267,21 @@ sub merge_and_mark_dup { else { my $merge = sprintf q{%s merge -u -@ %d - %s}, $tools{samtools}, $helper_threads, $input_str; - my $markdup = sprintf q{%s markdup --mode %s --output-fmt bam,level=0 -S --include-fails -T %s -@ %d -f %s.met - -}, + my $markdup; + if(exists $options->{legacy}) { + my $mmflagmod = _which('mmFlagModifier') || die "Unable to find 'mmFlagModifier' in path"; + my $bammarkdups = _which('bammarkduplicates2') || die "Unable to find 'bammarkduplicates2' in path"; + + my $mmQcRemove = sprintf '%s --remove -l 0 -@ %d', $mmflagmod, $helper_threads; + my $bammarkdup = sprintf '%s tmpfile=%s M=%s.met level=0 markthreads=%d', $bammarkdups, $strmd_tmp, $marked, $helper_threads; + my $mmQcReplace = sprintf '%s --replace -l 0 -@ %d', $mmflagmod, $helper_threads; + + $markdup = sprintf q{%s | %s | %s}, $mmQcRemove, $bammarkdup, $mmQcReplace; + } + else { + $markdup = sprintf q{%s markdup --mode %s --output-fmt bam,level=0 -S --include-fails -T %s -@ %d -f %s.met - -}, $tools{samtools}, $options->{dupmode}, $strmd_tmp, $helper_threads, $marked; + } my $compress = sprintf q{%s view -T %s --output-fmt %s -@ %d -}, $tools{samtools}, $options->{reference}, $out_fmt, $helper_threads; my $idx = sprintf q{%s index -@ %d %s - %s.%s}, diff --git a/lib/PCAP/Bwa.pm b/lib/PCAP/Bwa.pm index a963fb9..048d60a 100644 --- a/lib/PCAP/Bwa.pm +++ b/lib/PCAP/Bwa.pm @@ -214,9 +214,19 @@ sub split_in { my $samtools = _which('samtools') || die "Unable to find 'samtools' in path"; my $mmQcStrip = sprintf '%s --remove -l 0 -@ %d -i %s', _which('mmFlagModifier'), $helpers, $input->in; my $view = sprintf '%s view %s -bu -T %s -F 2816 -@ %d -', $samtools, $TAG_STRIP, $options->{'reference'}, $helpers; # leave - my $collate = sprintf '%s collate -Ou -@ %d - %s/collate', $samtools, $helpers, $collate_folder; - my $split = sprintf '%s split --output-fmt bam,level=1 -@ %d -u %s/unknown.bam -f %s/%%!_i.bam -', $samtools, $helpers, $split_folder, $split_folder; - my $cmd = sprintf '%s | %s | %s | %s', $mmQcStrip, $view, $collate, $split; + my $collate_split; + if(exists $options->{legacy}) { + my $bamtofastq = _which('bamtofastq') || die "Unable to find 'bamtofastq' in path"; + $collate_split = sprintf '%s exclude=QCFAIL,SECONDARY,SUPPLEMENTARY tryoq=1 gz=1 level=1 outputperreadgroup=1 outputperreadgroupsuffixF=_i.fq outputperreadgroupsuffixF2=_i.fq T=%s/bamtofastq outputdir=%s split=%s', + $bamtofastq, $collate_folder, $split_folder, + $fragment_size * $MILLION * $BAM_MULT; + } + else { + my $collate = sprintf '%s collate -Ou -@ %d - %s/collate', $samtools, $helpers, $collate_folder; + my $split = sprintf '%s split --output-fmt bam,level=1 -@ %d -u %s/unknown.bam -f %s/%%!_i.bam -', $samtools, $helpers, $split_folder, $split_folder; + $collate_split = sprintf '%s | %s', $collate, $split; + } + my $cmd = sprintf '%s | %s | %s', $mmQcStrip, $view, $collate_split; # treat as interleaved fastq push @commands, 'set -o pipefail'; push @commands, $cmd; @@ -314,9 +324,15 @@ sub bwa_mem { } } else { - # bam/cram - my $tofastq = sprintf '%s fastq -@ %d -N %s', $tools{samtools}, $threads, $split; - $bwa = sprintf '%s | %s /dev/stdin', $tofastq, $bwa; + # due to legacy processing need to handle bam or fastq input here + if($split =~ m/\.gz$/) { + $bwa .= ' '.$split; + } + else { + # bam/cram + my $tofastq = sprintf '%s fastq -@ %d -N %s', $tools{samtools}, $threads, $split; + $bwa = sprintf '%s | %s /dev/stdin', $tofastq, $bwa; + } } my $sorted_bam_stub = $split;