diff --git a/CHANGES.md b/CHANGES.md index f6cb152..8b56576 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,3 +1,7 @@ +### 1.9.0 +* Fix bugs #52 and #53 +* Modified bwa_mem.pl to accept multi-readgroup BAM as input + ### 1.7.1 * Turns out BWA mem still requires fixmates to get proper isize distributions * bumped biobambam to [0.0.191](https://github.com/gt1/biobambam/releases/tag/0.0.191-release-20150401083643) diff --git a/MYMETA.json b/MYMETA.json index d1622d1..e42efec 100644 --- a/MYMETA.json +++ b/MYMETA.json @@ -4,7 +4,7 @@ "unknown" ], "dynamic_config" : 0, - "generated_by" : "ExtUtils::MakeMaker version 6.68, CPAN::Meta::Converter version 2.131560", + "generated_by" : "ExtUtils::MakeMaker version 6.68, CPAN::Meta::Converter version 2.142690", "license" : [ "gpl_2" ], @@ -57,5 +57,5 @@ } }, "release_status" : "stable", - "version" : "v1.8.2" + "version" : "v1.9.0" } diff --git a/MYMETA.yml b/MYMETA.yml index 9fc022e..b99173e 100644 --- a/MYMETA.yml +++ b/MYMETA.yml @@ -3,40 +3,40 @@ abstract: unknown author: - unknown build_requires: - ExtUtils::MakeMaker: 0 + ExtUtils::MakeMaker: '0' configure_requires: - ExtUtils::MakeMaker: 0 + ExtUtils::MakeMaker: '0' dynamic_config: 0 -generated_by: 'ExtUtils::MakeMaker version 6.68, CPAN::Meta::Converter version 2.131560' +generated_by: 'ExtUtils::MakeMaker version 6.68, CPAN::Meta::Converter version 2.142690' license: gpl meta-spec: url: http://module-build.sourceforge.net/META-spec-v1.4.html - version: 1.4 + version: '1.4' name: PCAP no_index: directory: - t - inc requires: - Bio::DB::Sam: 1.39 - Bio::Root::Version: 1.006923 - Capture::Tiny: 0.24 - Config::IniFiles: 2.83 - Const::Fast: 0.014 - Data::UUID: 1.219 - Devel::Cover: 1.09 - File::Which: 0.05 - GD: 2.52 - IPC::System::Simple: 1.25 - List::Util: 1.38 - Math::Gradient: 0.04 - Module::Build: 0.42 - Pod::Coverage: 0.23 - Proc::PID::File: 1.27 - Proc::ProcessTable: 0.5 - Sub::Exporter::Progressive: 0.001011 - Term::UI: 0.42 - Test::Fatal: 0.013 - Try::Tiny: 0.19 - XML::Simple: 2.2 -version: v1.8.2 + Bio::DB::Sam: '1.39' + Bio::Root::Version: '1.006923' + Capture::Tiny: '0.24' + Config::IniFiles: '2.83' + Const::Fast: '0.014' + Data::UUID: '1.219' + Devel::Cover: '1.09' + File::Which: '0.05' + GD: '2.52' + IPC::System::Simple: '1.25' + List::Util: '1.38' + Math::Gradient: '0.04' + Module::Build: '0.42' + Pod::Coverage: '0.23' + Proc::PID::File: '1.27' + Proc::ProcessTable: '0.5' + Sub::Exporter::Progressive: '0.001011' + Term::UI: '0.42' + Test::Fatal: '0.013' + Try::Tiny: '0.19' + XML::Simple: '2.2' +version: v1.9.0 diff --git a/bin/bwa_mem.pl b/bin/bwa_mem.pl index cb87d12..1d1cf23 100755 --- a/bin/bwa_mem.pl +++ b/bin/bwa_mem.pl @@ -24,6 +24,7 @@ use autodie qw(:all); use FindBin qw($Bin); use lib "$Bin/../lib"; +use Cwd qw(abs_path); use File::Path qw(remove_tree make_path); use Getopt::Long; @@ -51,7 +52,7 @@ my $options = setup(); my $threads = PCAP::Threaded->new($options->{'threads'}); - &PCAP::Threaded::disable_out_err if(exists $options->{'index'}); + &PCAP::Threaded::disable_out_err if(!exists $options->{'index'} && $options->{'threads'} == 1); # register processes $threads->add_function('split', \&PCAP::Bwa::split_in); @@ -130,7 +131,9 @@ sub setup { make_path($logs) unless(-d $logs); $opts{'tmp'} = $tmpdir; - $opts{'raw_files'} = \@ARGV; + for(@ARGV) { + push @{$opts{'raw_files'}}, abs_path($_); + } my $max_split = PCAP::Bwa::mem_prepare(\%opts); diff --git a/bin/gnos_pull.pl b/bin/gnos_pull.pl index b91e327..e69be30 100755 --- a/bin/gnos_pull.pl +++ b/bin/gnos_pull.pl @@ -35,11 +35,10 @@ use Const::Fast qw(const); use File::Copy qw(move); use File::Fetch; -use File::Path qw(make_path); +use File::Path qw(make_path remove_tree); use File::Spec; use File::Which qw(which); use IO::File; -use IO::Uncompress::Gunzip qw(gunzip $GunzipError); use JSON qw(decode_json); use List::Util qw(first any); use Proc::PID::File; @@ -50,7 +49,7 @@ const my @ANALYSIS_TYPES => (qw(ALIGNMENTS CALLS)); const my @AVAILABLE_COMPOSITE_FILTERS => (qw(max_dataset_GB multi_tumour sanger_version jamboree_approved manual_donor_blacklist)); const my $DEFAULT_URL => 'http://pancancer.info/gnos_metadata/latest'; -const my $GTDL_COMMAND => '%s%s --max-children 4 --rate-limit 200 -v -c %s -d %scghub/data/analysis/download/%s -p %s'; +const my $GTDL_COMMAND => '%s%s --max-children 3 --rate-limit 200 -v -c %s -d %scghub/data/analysis/download/%s -p %s'; { my $options = option_builder(); @@ -191,6 +190,24 @@ sub pull_data { } } +sub check_or_create_symlink { + my ($source, $target) = @_; + unless(-e $source) { + die "FATAL: Source file doesn't exist: $source\n"; + } + if(-l $target) { + my $existing_target = readlink $target; + if($source ne $existing_target) { + unlink $target; + symlink($source, $target); + } + } + else { + symlink($source, $target); + } + return 1; +} + sub pull_alignments { my ($options, $donor, $outbase, $donor_base) = @_; @@ -228,11 +245,17 @@ sub pull_bam { my $orig_bam = $f_base.'/'.$bam_data->{'aligned_bam'}->{'bam_file_name'}; my $sym_bam = $sample_base.'/'.$bam_data->{'aliquot_id'}.'.bam'; - return if(-e $success); + if(-e $success) { + check_or_create_symlink($orig_bam, $sym_bam); + check_or_create_symlink($orig_bam.'.bai', $sym_bam.'.bai'); + create_bas($repo, $gnos_id, $sym_bam); + return; + } + return if($options->{'symlinks'}); my $out_file = "$f_base.out.log"; my $err_file = "$f_base.err.log"; - $download .= "> $f_base.out.log"; + $download .= " > $f_base.out.log"; $download = "($download) >& $err_file"; warn "Executing: $download\n"; @@ -248,27 +271,45 @@ sub pull_bam { unlink $out_file; unlink $err_file; - symlink($orig_bam, $sym_bam) unless(-e $sym_bam); - symlink($orig_bam.'.bai', $sym_bam.'.bai') unless(-e $sym_bam.'.bai'); - - # pull the bas file, done here to handle back fill of this data - my $get_bas = sprintf '%s %s/xml_to_bas.pl -d %scghub/metadata/analysisFull/%s -o %s -b %s', - $^X, - $Bin, - $repo, - $gnos_id, - $sym_bam.'.bas', - $sym_bam; - warn "Executing: $get_bas\n"; - my ($stdout, $stderr, $exit_c) = capture { system($get_bas); }; - - die "A problem occured while executing: $get_bas\n\nSTDOUT:\n$stdout\n\nSTDERR:$stderr\n" if($exit_c); + check_or_create_symlink($orig_bam, $sym_bam); + check_or_create_symlink($orig_bam.'.bai', $sym_bam.'.bai'); + create_bas($repo, $gnos_id, $sym_bam); # touch a success file in the output loc open my $S, '>', $success; close $S; return 1; } +sub create_bas { + my ($repo, $gnos_id, $sym_bam) = @_; + my $bas_file = $sym_bam.'.bas'; + unlink $bas_file if(-l $bas_file); + return if(-s $bas_file); + # pull the bas file, done here to handle back fill of this data + my $get_bas = sprintf '%s %s/xml_to_bas.pl -d %scghub/metadata/analysisFull/%s -o %s -b %s', + $^X, + $Bin, + $repo, + $gnos_id, + $bas_file, + $sym_bam; + my $success = 0; + for(0..5) { + warn "Executing: $get_bas\n"; + my ($stdout, $stderr, $exit_c) = capture { system($get_bas); }; + if($exit_c) { + warn "A problem occured while executing: $get_bas\n\nSTDOUT:\n$stdout\n\nSTDERR:$stderr\n...Retry\n"; + sleep 30; + } + else { + $success++; + last; + } + } + die "Failed after multiple attempts, aborting bas generation using $get_bas" unless($success); + return 1; +} + sub pull_calls { my ($options, $donor, $outbase, $donor_base) = @_; my @callers = @{$donor->{'flags'}->{'variant_calling_performed'}}; @@ -296,7 +337,12 @@ sub pull_calls { $outbase; my $f_base = $outbase.'/'.$gnos_id; my $success = $f_base.'/.success.t'; - next if(-e $success); + if(-e $success) { + make_path($donor_base) unless(-e $donor_base); + check_or_create_symlink($f_base, "$donor_base/$caller"); + next; + } + next if($options->{'symlinks'}); my $err_file = "$f_base.err.log"; my $out_fh = IO::File->new("$f_base.out.log", "w+"); @@ -315,7 +361,7 @@ sub pull_calls { # now build symlinks make_path($donor_base) unless(-e $donor_base); - symlink($f_base, "$donor_base/$caller") unless(-e "$donor_base/$caller"); + check_or_create_symlink($f_base, "$donor_base/$caller"); # touch a success file in the output loc open my $S, '>', $success; close $S; @@ -356,7 +402,9 @@ sub load_data { my @to_process; my %project_dist; my @filter_keys = keys %{$options->{'FILTERS'}}; - my $z = IO::Uncompress::Gunzip->new($gz_manifest, MultiStream => 1) or die "IO::Uncompress::Gunzip failed: $GunzipError\n"; + my $command = sprintf 'gunzip -c %s', $gz_manifest; + my ($pid, $z); + $pid = open $z, q{-|}, $command or croak 'Could not fork: '.$!; DONOR: while(my $jsonl = <$z>) { my $donor = decode_json($jsonl); @@ -489,7 +537,7 @@ sub manifest { make_path($options->{'outdir'}) unless(-e $options->{'outdir'}); $ff = File::Fetch->new(uri => $to_get); $where = $ff->fetch( to => $options->{'outdir'}); - my $dest = $options->{'outdir'}.'/donor_p.jsonl.gz'; + my $dest = $options->{'outdir'}.'/'.$options->{'analysis'}.'_donor_p.jsonl.gz'; move($where, $dest); return $dest; } @@ -503,6 +551,7 @@ sub option_builder { 'h|help' => \$opts{'h'}, 'm|man' => \$opts{'m'}, 'i|info' => \$opts{'info'}, + 's|symlinks' => \$opts{'symlinks'}, 'u|url=s' => \$opts{'url'}, 't|threads=i' => \$opts{'threads'}, 'a|analysis=s' => \$opts{'analysis'}, @@ -551,6 +600,8 @@ =head1 SYNOPSIS Other options: + --symlinks (-s) Rebuild symlinks only. + --threads (-t) Number of parallel GNOS retrievals. --url (-u) The base URL to retrieve jsonl file from diff --git a/c/Makefile b/c/Makefile index 775faa8..8b7a416 100644 --- a/c/Makefile +++ b/c/Makefile @@ -1,4 +1,4 @@ -VERSION=1.8.2 +VERSION=1.9.0 #Compiler CC = gcc -O3 -DVERSION='"$(VERSION)"' -g diff --git a/docs.tar.gz b/docs.tar.gz index 9fdab34..d63b2de 100644 Binary files a/docs.tar.gz and b/docs.tar.gz differ diff --git a/lib/PCAP.pm b/lib/PCAP.pm index 8be7b43..f1a714b 100644 --- a/lib/PCAP.pm +++ b/lib/PCAP.pm @@ -24,7 +24,7 @@ use strict; use Const::Fast qw(const); use base 'Exporter'; -our $VERSION = '1.8.2'; +our $VERSION = '1.9.0'; our @EXPORT = qw($VERSION); const my $LICENSE => @@ -68,6 +68,7 @@ const my %UPGRADE_PATH => ( '0.1.0' => 'biobambam,bwa,samtools', '1.8.0' => '', '1.8.1' => '', '1.8.2' => '', + '1.9.0' => '', ); sub license { diff --git a/lib/PCAP/Bam.pm b/lib/PCAP/Bam.pm index fd9d835..e2218b7 100644 --- a/lib/PCAP/Bam.pm +++ b/lib/PCAP/Bam.pm @@ -54,13 +54,15 @@ sub new { } sub rg_line_for_output { - my ($bam, $sample, $uniq_id) = @_; + my ($bam, $sample, $uniq_id, $existing_rgid) = @_; my $sam = sam_ob($bam); my $header = $sam->header->text; my $rg_line; while($header =~ m/^(\@RG\t[^\n]+)/xmsg) { my $new_rg = $1; - die "BAM file appears to contain data for multiple readgroups, not supported: \n\n$header\n" if(defined $rg_line); + my ($this_id) = $new_rg =~ m/\tID:([^\t]+)/; + next if(defined $existing_rgid && $this_id ne $existing_rgid); + die "BAM file appears to contain data for multiple readgroups, not supported unless 'existing_rgid' is found: \n\n$header\n" if(defined $rg_line); $rg_line = $new_rg; if($uniq_id) { my $uuid = lc Data::UUID->new->create_str; @@ -135,6 +137,7 @@ sub bam_stats { my $tmp = $options->{'tmp'}; my $bam = File::Spec->catdir($options->{'outdir'}, $options->{'sample'}).'.bam';; my $bas = "$bam.bas"; + return $bas if PCAP::Threaded::success_exists(File::Spec->catdir($tmp, 'progress'), 0); my $command = _which('bam_stats') || die "Unable to find 'bam_stats' in path"; $command .= sprintf $BAM_STATS, $bam, $bas; if(exists $options->{'charts'} && defined $options->{'charts'}) { @@ -143,6 +146,7 @@ sub bam_stats { $command .= ' -p '.$chart_dir; } PCAP::Threaded::external_process_handler(File::Spec->catdir($tmp, 'logs'), $command, 0); + PCAP::Threaded::touch_success(File::Spec->catdir($tmp, 'progress'), 0); return $bas; } diff --git a/lib/PCAP/Bwa.pm b/lib/PCAP/Bwa.pm index 4921598..084f861 100644 --- a/lib/PCAP/Bwa.pm +++ b/lib/PCAP/Bwa.pm @@ -36,11 +36,13 @@ use File::Copy qw(copy move); use PCAP::Bwa::Meta; const my $BWA_ALN => q{ aln%s -t %s -f %s_%s.sai %s %s.%s}; -const my $BAMFASTQ => q{ exclude=QCFAIL,SECONDARY,SUPPLEMENTARY T=%s S=%s O=%s O2=%s gz=1 level=1 F=%s F2=%s filename=%s split=%s}; +const my $BAMFASTQ => q{ exclude=QCFAIL,SECONDARY,SUPPLEMENTARY tryoq=1 gz=1 level=1 outputperreadgroup=1 outputperreadgroupsuffixF=_i.fq outputperreadgroupsuffixF2=_i.fq T=%s outputdir=%s filename=%s split=%s}; const my $BWA_MEM => q{ mem%s -T 0 -R %s -t %s %s}; const my $ALN_TO_SORTED => q{ sampe -P -a 1000 -r '%s' %s %s_1.sai %s_2.sai %s.%s %s.%s | %s fixmate=1 inputformat=sam level=1 tmpfile=%s_tmp O=%s_sorted.bam}; const my $BAMSORT => q{ fixmate=1 inputformat=sam level=1 tmpfile=%s_tmp O=%s_sorted.bam inputthreads=%s outputthreads=%s calmdnm=1 calmdnmrecompindetonly=1 calmdnmreference=%s}; +const my $FALSE_RG => q{@RG\tID:%s\tSM:%s\tLB:default\tPL:ILLUMINA}; + const my $READPAIR_SPLITSIZE => 10, const my $PAIRED_FQ_LINE_MULT => 4; const my $INTERLEAVED_FQ_LINE_MULT => 8; @@ -99,7 +101,7 @@ sub mem_mapmax { opendir(my $dh, $folder); while(my $file = readdir $dh) { next if($file =~ m/^\./); - next if($file =~ m/^2\.[[:digit]]+/); # captured by 1.* + next if($file =~ m/^pairedfq2\.[[:digit:]]+/); # captured by 1.* push @files, File::Spec->catfile($folder, $file); } closedir($dh); @@ -137,32 +139,42 @@ sub split_in { if($input->fastq) { # paired fq input if($input->paired_fq) { - push @commands, sprintf 'split -a 3 -d -l %s %s %s.' - , $fragment_size * $MILLION * $PAIRED_FQ_LINE_MULT - , $input->in.'_1.'.$input->fastq - , File::Spec->catfile($split_folder, '1'); - push @commands, sprintf 'split -a 3 -d -l %s %s %s.' - , $fragment_size * $MILLION * $PAIRED_FQ_LINE_MULT - , $input->in.'_2.'.$input->fastq - , File::Spec->catfile($split_folder, '2'); + my $fq1 = $input->in.'_1.'.$input->fastq; + my $fq2 = $input->in.'_2.'.$input->fastq; + if($input->fastq =~ m/[.]gz$/) { + symlink $fq1, File::Spec->catfile($split_folder, 'pairedfq1.0.'.$input->fastq); + symlink $fq2, File::Spec->catfile($split_folder, 'pairedfq2.0.'.$input->fastq); + } + else { + push @commands, sprintf 'split -a 3 -d -l %s %s %s.' + , $fragment_size * $MILLION * $PAIRED_FQ_LINE_MULT + , $fq1 + , File::Spec->catfile($split_folder, 'pairedfq1'); + push @commands, sprintf 'split -a 3 -d -l %s %s %s.' + , $fragment_size * $MILLION * $PAIRED_FQ_LINE_MULT + , $fq2 + , File::Spec->catfile($split_folder, 'pairedfq2'); + } } # interleaved FQ else { - push @commands, sprintf 'split -a 3 -d -l %s %s %s.' - , $fragment_size * $MILLION * $INTERLEAVED_FQ_LINE_MULT - , $input->in.'.'.$input->fastq - , File::Spec->catfile($split_folder, 'i'); + my $fq_i = $input->in.'.'.$input->fastq; + if($input->fastq =~ m/[.]gz$/) { + symlink $fq_i, File::Spec->catfile($split_folder, 'i.'.$input->fastq); + } + else { + push @commands, sprintf 'split -a 3 -d -l %s %s %s.' + , $fragment_size * $MILLION * $INTERLEAVED_FQ_LINE_MULT + , $fq_i + , File::Spec->catfile($split_folder, 'i'); + } } } # if bam input else { my $bam2fq = which('bamtofastq') || die "Unable to find 'bamtofastq' in path"; $bam2fq .= sprintf $BAMFASTQ, File::Spec->catfile($tmp, "bamtofastq.$index"), - File::Spec->catfile($tmp, "bamtofastq.$index.s"), - File::Spec->catfile($tmp, "bamtofastq.$index.o1"), - File::Spec->catfile($tmp, "bamtofastq.$index.o2"), - File::Spec->catfile($split_folder, 'i'), - File::Spec->catfile($split_folder, 'i'), + $split_folder, $input->in, $fragment_size * $MILLION * $BAM_MULT; # treat as interleaved fastq @@ -204,7 +216,11 @@ sub bwa_mem { $rg_line = q{'}.$input->rg_header(q{\t}).q{'}; } else { - ($rg_line, undef) = PCAP::Bam::rg_line_for_output($input->in, $options->{'sample'}); + my ($rg) = $split =~ m|/split/[[:digit:]]+/(.+)_i.fq_[[:digit:]]+.gz$|; + ($rg_line, undef) = PCAP::Bam::rg_line_for_output($input->in, $options->{'sample'}, undef, $rg); + unless($rg_line) { + $rg_line = sprintf $FALSE_RG, $split_element, $options->{'sample'}; + } $rg_line = q{'}.$rg_line.q{'}; } @@ -225,7 +241,7 @@ sub bwa_mem { # uncoverable branch false if($input->paired_fq) { my $split2 = $split; - $split2 =~ s/1(\.[[:digit:]]+)$/2$1/; + $split2 =~ s/pairedfq1(\.[[:digit:]]+)/pairedfq2$1/; $bwa .= ' '.$split; $bwa .= ' '.$split2; } diff --git a/t/pcapBam.t b/t/pcapBam.t index d31c93d..2a02798 100644 --- a/t/pcapBam.t +++ b/t/pcapBam.t @@ -74,8 +74,11 @@ subtest 'rg line checks' => sub { my ($rg_line, $bio_db_sam) = PCAP::Bam::rg_line_for_output($test_bam); is($rg_line, $EXPECTED_RGLINE, 'Retreived single RG line'); like(exception{ PCAP::Bam::rg_line_for_output($multi_bam) } - , qr/BAM file appears to contain data for multiple readgroups, not supported:/m + , qr/BAM file appears to contain data for multiple readgroups, not supported unless 'existing_rgid' is found:/m , 'Fail when multiple readgroups in BAM'); + + is((PCAP::Bam::rg_line_for_output($multi_bam, undef, undef, 1))[0], $EXPECTED_RGLINE, 'Correct RG from multiple RG header'); + my $obj = new_ok($MODULE => [$test_bam]); is($obj->single_rg_value('PL'), $EXPECTED_RG_PL, 'Tag successfully retrieved'); $obj = new_ok($MODULE => [$multi_bam]);