Skip to content

Commit

Permalink
Merge pull request #56 from ICGC-TCGA-PanCancer/dev
Browse files Browse the repository at this point in the history
Resolves #54
  • Loading branch information
keiranmraine committed Dec 15, 2014
2 parents d4e4259 + a566124 commit 7cb9d0a
Show file tree
Hide file tree
Showing 8 changed files with 98 additions and 33 deletions.
1 change: 1 addition & 0 deletions MANIFEST
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ t/data/no_readgroups.bam
t/data/no_readgroups.sam
t/data/not_really_a.bam
t/data/paired.bam
t/data/reconcile_bas.bam
t/data/Stats.bam
t/data/Stats.bam.bas
t/data/test.bam.bas
Expand Down
4 changes: 2 additions & 2 deletions MYMETA.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"unknown"
],
"dynamic_config" : 0,
"generated_by" : "ExtUtils::MakeMaker version 6.68, CPAN::Meta::Converter version 2.131560",
"generated_by" : "ExtUtils::MakeMaker version 6.68, CPAN::Meta::Converter version 2.142690",
"license" : [
"gpl_2"
],
Expand Down Expand Up @@ -55,5 +55,5 @@
}
},
"release_status" : "stable",
"version" : "v1.2.1"
"version" : "v1.2.3"
}
48 changes: 24 additions & 24 deletions MYMETA.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,38 +3,38 @@ abstract: unknown
author:
- unknown
build_requires:
ExtUtils::MakeMaker: 0
ExtUtils::MakeMaker: '0'
configure_requires:
ExtUtils::MakeMaker: 0
ExtUtils::MakeMaker: '0'
dynamic_config: 0
generated_by: 'ExtUtils::MakeMaker version 6.68, CPAN::Meta::Converter version 2.131560'
generated_by: 'ExtUtils::MakeMaker version 6.68, CPAN::Meta::Converter version 2.142690'
license: gpl
meta-spec:
url: http://module-build.sourceforge.net/META-spec-v1.4.html
version: 1.4
version: '1.4'
name: PCAP
no_index:
directory:
- t
- inc
requires:
Bio::DB::Sam: 1.39
Bio::Root::Version: 1.006923
Capture::Tiny: 0.24
Const::Fast: 0.014
Data::UUID: 1.219
Devel::Cover: 1.09
File::Which: 0.05
GD: 2.52
IPC::System::Simple: 1.25
List::Util: 1.38
Math::Gradient: 0.04
Module::Build: 0.42
Pod::Coverage: 0.23
Proc::ProcessTable: 0.5
Sub::Exporter::Progressive: 0.001011
Term::UI: 0.42
Test::Fatal: 0.013
Try::Tiny: 0.19
XML::Simple: 2.2
version: v1.2.1
Bio::DB::Sam: '1.39'
Bio::Root::Version: '1.006923'
Capture::Tiny: '0.24'
Const::Fast: '0.014'
Data::UUID: '1.219'
Devel::Cover: '1.09'
File::Which: '0.05'
GD: '2.52'
IPC::System::Simple: '1.25'
List::Util: '1.38'
Math::Gradient: '0.04'
Module::Build: '0.42'
Pod::Coverage: '0.23'
Proc::ProcessTable: '0.5'
Sub::Exporter::Progressive: '0.001011'
Term::UI: '0.42'
Test::Fatal: '0.013'
Try::Tiny: '0.19'
XML::Simple: '2.2'
version: v1.2.3
70 changes: 66 additions & 4 deletions bin/xml_to_bas.pl
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
use autodie qw(:all);
use Getopt::Long;
use Pod::Usage qw(pod2usage);
use Bio::DB::Sam;

my $options = &setup;
xml_to_bas($options);
Expand All @@ -26,21 +27,22 @@ sub xml_to_bas {
, KeyAttr => [],);

my $bas_json = find_bas_json($document);
json_to_bas_file($bas_json, $options->{'output'});
json_to_bas_file($bas_json, $options);
return 1;
}

sub json_to_bas_file {
my ($bas_json, $out_path) = @_;
my ($bas_json, $options) = @_;

my $bas_data = decode_json $bas_json;
validate_bas($bas_data, $options);
my @metrics = @{$bas_data->{'qc_metrics'}};

my @columns = bas_columns($metrics[0]);

my $OUT;
if(defined $out_path) {
open $OUT, '>', $out_path;
if(defined $options->{'output'}) {
open $OUT, '>', $options->{'output'};
}
else {
$OUT = *STDOUT;
Expand All @@ -66,6 +68,59 @@ sub bas_columns {
return @columns;
}

# Sanity-check decoded BAS data before it is written out.
#
# Currently the only check is for several qc_metrics rows sharing one
# read_group_id. When that happens the data can only be repaired with the
# help of the BAM file given via the 'bam' option.
#
# Params : $bas_data - hashref of decoded BAS JSON (must hold 'qc_metrics')
#          $options  - hashref of command line options; 'bam' is consulted
# Returns: 1 when the data is usable (possibly after in-place correction)
# Dies   : when a clash is found and no BAM was supplied, or when
#          correct_clash cannot resolve the clash
sub validate_bas {
  my ($bas_data, $options) = @_;

  # first look for read_group_id clash, nothing more to do when there is none
  return 1 unless(clash_check($bas_data));

  # without a BAM header there is nothing to reconcile the IDs against
  if(!$options->{'bam'}) {
    die "ERROR: multiple metric entries with the same read_group_id. May be recoverable with '-b' defined.\n";
  }

  warn "WARNING: multiple metric entries with the same read_group_id attempting to compensate...\n";
  correct_clash($bas_data, $options->{'bam'});
  return 1;
}

# Repair clashing read_group_id values using the read groups of a BAM header.
#
# A platform-unit (PU) to read-group ID map is built from the @RG lines of the
# BAM header. Every qc_metrics row then has 'read_group_id' and
# metrics->'readgroup' overwritten with the ID whose PU equals the row's
# metrics->'platform_unit'. Rows are hashrefs, so $bas_data is modified in place.
#
# Params : $bas_data - hashref of decoded BAS JSON (must hold 'qc_metrics')
#          $bamfile  - path to the BAM file the BAS data relates to
# Returns: 1 on success
# Dies   : when two @RG lines share a PU, when a row's platform_unit has no
#          matching @RG line, or when IDs still clash after the rewrite
sub correct_clash {
  my ($bas_data, $bamfile) = @_;

  # open the BAM named by the caller rather than relying on a file-level $options
  my $bam = Bio::DB::Sam->new(-bam => $bamfile);

  my %rg_by_pu;
  foreach my $hl (split /\n/, $bam->header->text) {
    next unless($hl =~ m/^\@RG/);
    my ($pu) = $hl =~ m/\tPU:([^\t]+)/;
    my ($id) = $hl =~ m/\tID:([^\t]+)/;
    # a read group without PU or ID cannot take part in the mapping
    next unless(defined $pu && defined $id);
    die "ERROR: Unable to recover read_group_id clash using PU field.\n" if(exists $rg_by_pu{$pu});
    $rg_by_pu{$pu} = $id;
  }

  my @metrics = @{$bas_data->{'qc_metrics'}};
  for my $row (@metrics) {
    my $row_pu = $row->{'metrics'}->{'platform_unit'};
    # refuse to write an undefined ID into the row when the PU is unknown
    unless(defined $row_pu && exists $rg_by_pu{$row_pu}) {
      die "ERROR: Unable to recover read_group_id clash using PU field.\n";
    }
    $row->{'read_group_id'} = $rg_by_pu{$row_pu};
    $row->{'metrics'}->{'readgroup'} = $rg_by_pu{$row_pu};
  }

  # the rewrite must have removed every duplicate
  die "ERROR: Unable to recover read_group_id clash using PU field.\n" if(clash_check($bas_data));
  return 1;
}

# Count qc_metrics rows whose read_group_id was already used by an earlier row.
#
# Params : $bas_data - hashref holding the 'qc_metrics' arrayref of row hashrefs
# Returns: number of repeated rows, 0 when every read_group_id is unique
sub clash_check {
  my ($bas_data) = @_;
  my @rows = @{ $bas_data->{'qc_metrics'} };

  my %seen;
  my $duplicates = 0;
  foreach my $entry (@rows) {
    my $rg_id = $entry->{'read_group_id'};
    # post-increment is false only the first time an ID is met
    $duplicates++ if($seen{$rg_id}++);
  }
  return $duplicates;
}


sub find_bas_json {
my $document = shift;
Expand Down Expand Up @@ -97,6 +152,7 @@ sub setup{
'm|man' => \$opts{'m'},
'v|version' => \$opts{'v'},
'd|uri=s' => \$opts{'uri'},
'b|bam=s' => \$opts{'bam'},
'o|output=s' => \$opts{'output'},
'<>' => sub{push(@random_args,shift(@_));}
) or pod2usage(2);
Expand Down Expand Up @@ -131,6 +187,12 @@ =head1 SYNOPSIS
-uri -d Same URI used by gtdownload
-output -o Name for output file. Defaults to STDOUT.
Optional parameters:
-bam -b BAM file this data relates to
- checks retrieved data correlates with expected BAM
- additionally can correct read_group_id if other fields correlate when
clashes occur.
Other:
-help -h Brief help message.
-man -m Full documentation.
Expand Down
Binary file modified docs.tar.gz
Binary file not shown.
3 changes: 2 additions & 1 deletion lib/PCAP.pm
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ use strict;
use Const::Fast qw(const);
use base 'Exporter';

our $VERSION = '1.2.2';
our $VERSION = '1.2.3';
our @EXPORT = qw($VERSION);

const my $LICENSE =>
Expand All @@ -51,6 +51,7 @@ const my %UPGRADE_PATH => ( '0.1.0' => 'biobambam,bwa,samtools',
'1.1.2' => 'biobambam,bwa,samtools',
'1.2.0' => '', # if later versions have new versions then all preceding need that tool listing
'1.2.1' => '',
'1.2.2' => '',
);

sub license {
Expand Down
5 changes: 3 additions & 2 deletions lib/PCAP/Bwa.pm
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ const my $BAMSORT => q{ inputformat=sam level=1 tmpfile=%s_tmp O=%s_sorted.bam i
# Each constant is its own statement; a trailing ',' would pass the next
# 'const' call as an extra argument to the previous one.
const my $READPAIR_SPLITSIZE => 10;
const my $PAIRED_FQ_LINE_MULT => 4;
const my $INTERLEAVED_FQ_LINE_MULT => 8;
# extra multiplier applied to the fragment size when the input is BAM
const my $BAM_MULT => 2;
const my $MILLION => 1_000_000;

const my $BWA_MEM_MAX_CORES => 6;
Expand Down Expand Up @@ -163,7 +164,7 @@ sub split_in {
File::Spec->catfile($split_folder, 'i'),
File::Spec->catfile($split_folder, 'i'),
$input->in,
$fragment_size * $MILLION;
$fragment_size * $MILLION * $BAM_MULT;
# treat as interleaved fastq
push @commands, $bam2fq;
}
Expand Down Expand Up @@ -224,7 +225,7 @@ sub bwa_mem {
# uncoverable branch false
if($input->paired_fq) {
my $split2 = $split;
$split2 =~ s/1(\.[[:digit:]]+)$/2$2/;
$split2 =~ s/1(\.[[:digit:]]+)$/2$1/;
$bwa .= ' '.$split;
$bwa .= ' '.$split2;
}
Expand Down
Binary file added t/data/reconcile_bas.bam
Binary file not shown.

0 comments on commit 7cb9d0a

Please sign in to comment.