Skip to content

Commit

Permalink
Merge pull request #83 from ICGC-TCGA-PanCancer/feature/remapBam
Browse files Browse the repository at this point in the history
Feature/remap bam
  • Loading branch information
keiranmraine committed Aug 3, 2015
2 parents b96d515 + 6cb4482 commit ec9801d
Show file tree
Hide file tree
Showing 4 changed files with 70 additions and 36 deletions.
7 changes: 5 additions & 2 deletions bin/bwa_mem.pl
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
use autodie qw(:all);
use FindBin qw($Bin);
use lib "$Bin/../lib";
use Cwd qw(abs_path);

use File::Path qw(remove_tree make_path);
use Getopt::Long;
Expand Down Expand Up @@ -51,7 +52,7 @@
my $options = setup();

my $threads = PCAP::Threaded->new($options->{'threads'});
&PCAP::Threaded::disable_out_err if(exists $options->{'index'});
&PCAP::Threaded::disable_out_err if(!exists $options->{'index'} && $options->{'threads'} == 1);

# register processes
$threads->add_function('split', \&PCAP::Bwa::split_in);
Expand Down Expand Up @@ -130,7 +131,9 @@ sub setup {
make_path($logs) unless(-d $logs);

$opts{'tmp'} = $tmpdir;
$opts{'raw_files'} = \@ARGV;
for(@ARGV) {
push @{$opts{'raw_files'}}, abs_path($_);
}

my $max_split = PCAP::Bwa::mem_prepare(\%opts);

Expand Down
33 changes: 22 additions & 11 deletions bin/gnos_pull.pl
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
use Const::Fast qw(const);
use File::Copy qw(move);
use File::Fetch;
use File::Path qw(make_path);
use File::Path qw(make_path remove_tree);
use File::Spec;
use File::Which qw(which);
use IO::File;
Expand Down Expand Up @@ -287,15 +287,26 @@ sub create_bas {
return if(-s $bas_file);
# pull the bas file, done here to handle back fill of this data
my $get_bas = sprintf '%s %s/xml_to_bas.pl -d %scghub/metadata/analysisFull/%s -o %s -b %s',
$^X,
$Bin,
$repo,
$gnos_id,
$bas_file,
$sym_bam;
warn "Executing: $get_bas\n";
my ($stdout, $stderr, $exit_c) = capture { system($get_bas); };
die "A problem occured while executing: $get_bas\n\nSTDOUT:\n$stdout\n\nSTDERR:$stderr\n" if($exit_c);
$^X,
$Bin,
$repo,
$gnos_id,
$bas_file,
$sym_bam;
my $success = 0;
for(0..5) {
warn "Executing: $get_bas\n";
my ($stdout, $stderr, $exit_c) = capture { system($get_bas); };
if($exit_c) {
warn "A problem occured while executing: $get_bas\n\nSTDOUT:\n$stdout\n\nSTDERR:$stderr\n...Retry\n";
sleep 30;
}
else {
$success++;
last;
}
}
die "Failed after multiple attempts, aborting bas generation using $get_bas" unless($success);
return 1;
}

Expand Down Expand Up @@ -589,7 +600,7 @@ =head1 SYNOPSIS
Other options:
--symlinks (-s) Rebuild symlinks only
--symlinks (-s) Rebuild symlinks only.
--threads (-t) Number of parallel GNOS retrievals.
Expand Down
8 changes: 6 additions & 2 deletions lib/PCAP/Bam.pm
Original file line number Diff line number Diff line change
Expand Up @@ -54,13 +54,15 @@ sub new {
}

sub rg_line_for_output {
my ($bam, $sample, $uniq_id) = @_;
my ($bam, $sample, $uniq_id, $existing_rgid) = @_;
my $sam = sam_ob($bam);
my $header = $sam->header->text;
my $rg_line;
while($header =~ m/^(\@RG\t[^\n]+)/xmsg) {
my $new_rg = $1;
die "BAM file appears to contain data for multiple readgroups, not supported: \n\n$header\n" if(defined $rg_line);
my ($this_id) = $new_rg =~ m/\tID:([^\t]+)/;
next if(defined $existing_rgid && $this_id ne $existing_rgid);
die "BAM file appears to contain data for multiple readgroups, not supported unless 'existing_rgid' is found: \n\n$header\n" if(defined $rg_line);
$rg_line = $new_rg;
if($uniq_id) {
my $uuid = lc Data::UUID->new->create_str;
Expand Down Expand Up @@ -135,6 +137,7 @@ sub bam_stats {
my $tmp = $options->{'tmp'};
my $bam = File::Spec->catdir($options->{'outdir'}, $options->{'sample'}).'.bam';;
my $bas = "$bam.bas";
return $bas if PCAP::Threaded::success_exists(File::Spec->catdir($tmp, 'progress'), 0);
my $command = _which('bam_stats') || die "Unable to find 'bam_stats' in path";
$command .= sprintf $BAM_STATS, $bam, $bas;
if(exists $options->{'charts'} && defined $options->{'charts'}) {
Expand All @@ -143,6 +146,7 @@ sub bam_stats {
$command .= ' -p '.$chart_dir;
}
PCAP::Threaded::external_process_handler(File::Spec->catdir($tmp, 'logs'), $command, 0);
PCAP::Threaded::touch_success(File::Spec->catdir($tmp, 'progress'), 0);
return $bas;
}

Expand Down
58 changes: 37 additions & 21 deletions lib/PCAP/Bwa.pm
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,13 @@ use File::Copy qw(copy move);
use PCAP::Bwa::Meta;

const my $BWA_ALN => q{ aln%s -t %s -f %s_%s.sai %s %s.%s};
const my $BAMFASTQ => q{ exclude=QCFAIL,SECONDARY,SUPPLEMENTARY T=%s S=%s O=%s O2=%s gz=1 level=1 F=%s F2=%s filename=%s split=%s};
const my $BAMFASTQ => q{ exclude=QCFAIL,SECONDARY,SUPPLEMENTARY tryoq=1 gz=1 level=1 outputperreadgroup=1 outputperreadgroupsuffixF=_i.fq outputperreadgroupsuffixF2=_i.fq T=%s outputdir=%s filename=%s split=%s};
const my $BWA_MEM => q{ mem%s -T 0 -R %s -t %s %s};
const my $ALN_TO_SORTED => q{ sampe -P -a 1000 -r '%s' %s %s_1.sai %s_2.sai %s.%s %s.%s | %s fixmate=1 inputformat=sam level=1 tmpfile=%s_tmp O=%s_sorted.bam};
const my $BAMSORT => q{ fixmate=1 inputformat=sam level=1 tmpfile=%s_tmp O=%s_sorted.bam inputthreads=%s outputthreads=%s calmdnm=1 calmdnmrecompindetonly=1 calmdnmreference=%s};

const my $FALSE_RG => q{@RG\tID:%s\tSM:%s\tLB:default\tPL:ILLUMINA};

const my $READPAIR_SPLITSIZE => 10,
const my $PAIRED_FQ_LINE_MULT => 4;
const my $INTERLEAVED_FQ_LINE_MULT => 8;
Expand Down Expand Up @@ -99,7 +101,7 @@ sub mem_mapmax {
opendir(my $dh, $folder);
while(my $file = readdir $dh) {
next if($file =~ m/^\./);
next if($file =~ m/^2\.[[:digit]]+/); # captured by 1.*
next if($file =~ m/^pairedfq2\.[[:digit:]]+/); # captured by 1.*
push @files, File::Spec->catfile($folder, $file);
}
closedir($dh);
Expand Down Expand Up @@ -137,32 +139,42 @@ sub split_in {
if($input->fastq) {
# paired fq input
if($input->paired_fq) {
push @commands, sprintf 'split -a 3 -d -l %s %s %s.'
, $fragment_size * $MILLION * $PAIRED_FQ_LINE_MULT
, $input->in.'_1.'.$input->fastq
, File::Spec->catfile($split_folder, '1');
push @commands, sprintf 'split -a 3 -d -l %s %s %s.'
, $fragment_size * $MILLION * $PAIRED_FQ_LINE_MULT
, $input->in.'_2.'.$input->fastq
, File::Spec->catfile($split_folder, '2');
my $fq1 = $input->in.'_1.'.$input->fastq;
my $fq2 = $input->in.'_2.'.$input->fastq;
if($input->fastq =~ m/[.]gz$/) {
symlink $fq1, File::Spec->catfile($split_folder, 'pairedfq1.0.'.$input->fastq);
symlink $fq2, File::Spec->catfile($split_folder, 'pairedfq2.0.'.$input->fastq);
}
else {
push @commands, sprintf 'split -a 3 -d -l %s %s %s.'
, $fragment_size * $MILLION * $PAIRED_FQ_LINE_MULT
, $fq1
, File::Spec->catfile($split_folder, 'pairedfq1');
push @commands, sprintf 'split -a 3 -d -l %s %s %s.'
, $fragment_size * $MILLION * $PAIRED_FQ_LINE_MULT
, $fq2
, File::Spec->catfile($split_folder, 'pairedfq2');
}
}
# interleaved FQ
else {
push @commands, sprintf 'split -a 3 -d -l %s %s %s.'
, $fragment_size * $MILLION * $INTERLEAVED_FQ_LINE_MULT
, $input->in.'.'.$input->fastq
, File::Spec->catfile($split_folder, 'i');
my $fq_i = $input->in.'.'.$input->fastq;
if($input->fastq =~ m/[.]gz$/) {
symlink $fq_i, File::Spec->catfile($split_folder, 'i.'.$input->fastq);
}
else {
push @commands, sprintf 'split -a 3 -d -l %s %s %s.'
, $fragment_size * $MILLION * $INTERLEAVED_FQ_LINE_MULT
, $fq_i
, File::Spec->catfile($split_folder, 'i');
}
}
}
# if bam input
else {
my $bam2fq = which('bamtofastq') || die "Unable to find 'bamtofastq' in path";
$bam2fq .= sprintf $BAMFASTQ, File::Spec->catfile($tmp, "bamtofastq.$index"),
File::Spec->catfile($tmp, "bamtofastq.$index.s"),
File::Spec->catfile($tmp, "bamtofastq.$index.o1"),
File::Spec->catfile($tmp, "bamtofastq.$index.o2"),
File::Spec->catfile($split_folder, 'i'),
File::Spec->catfile($split_folder, 'i'),
$split_folder,
$input->in,
$fragment_size * $MILLION * $BAM_MULT;
# treat as interleaved fastq
Expand Down Expand Up @@ -204,7 +216,11 @@ sub bwa_mem {
$rg_line = q{'}.$input->rg_header(q{\t}).q{'};
}
else {
($rg_line, undef) = PCAP::Bam::rg_line_for_output($input->in, $options->{'sample'});
my ($rg) = $split =~ m|/split/[[:digit:]]+/(.+)_i.fq_[[:digit:]]+.gz$|;
($rg_line, undef) = PCAP::Bam::rg_line_for_output($input->in, $options->{'sample'}, undef, $rg);
unless($rg_line) {
$rg_line = sprintf $FALSE_RG, $split_element, $options->{'sample'};
}
$rg_line = q{'}.$rg_line.q{'};
}

Expand All @@ -225,7 +241,7 @@ sub bwa_mem {
# uncoverable branch false
if($input->paired_fq) {
my $split2 = $split;
$split2 =~ s/1(\.[[:digit:]]+)$/2$1/;
$split2 =~ s/pairedfq1(\.[[:digit:]]+)/pairedfq2$1/;
$bwa .= ' '.$split;
$bwa .= ' '.$split2;
}
Expand Down

0 comments on commit ec9801d

Please sign in to comment.