diff --git a/perl/MANIFEST b/perl/MANIFEST index 29e4466..eba55be 100644 --- a/perl/MANIFEST +++ b/perl/MANIFEST @@ -4,6 +4,7 @@ bin/brassI_filter.pl bin/brassI_np_in.pl bin/brassI_pre_filter.pl bin/brassI_prep_bam.pl +bin/combineResults.pl bin/findbp bin/findbp-all bin/make-repeat-file @@ -21,6 +22,7 @@ lib/Graph/Reader/Velvet.pm lib/Sanger/CGP/Brass/Implement.pm lib/Sanger/CGP/BrassFilter/BlatFlag.pm lib/Sanger/CGP/BrassFilter/BrassMarkedGroups.pm +lib/Sanger/CGP/BrassFilter/Cleanup.pm lib/Sanger/CGP/BrassFilter/CnFlag.pm lib/Sanger/CGP/BrassFilter/OccursFlag.pm lib/Sanger/CGP/BrassFilter/TransFlag.pm diff --git a/perl/MYMETA.json b/perl/MYMETA.json index edc170a..83b2e51 100644 --- a/perl/MYMETA.json +++ b/perl/MYMETA.json @@ -43,5 +43,5 @@ } }, "release_status" : "stable", - "version" : "v2.2.0" + "version" : "v3.0.0" } diff --git a/perl/MYMETA.yml b/perl/MYMETA.yml index 7e754e1..7b60b28 100644 --- a/perl/MYMETA.yml +++ b/perl/MYMETA.yml @@ -25,4 +25,4 @@ requires: Pod::Coverage: '0.23' Template: '2.26' Test::Fatal: '0.013' -version: v2.2.0 +version: v3.0.0 diff --git a/perl/Makefile.PL b/perl/Makefile.PL index efac3c1..549fd1c 100755 --- a/perl/Makefile.PL +++ b/perl/Makefile.PL @@ -45,6 +45,7 @@ WriteMakefile( bin/brassI_pre_filter.pl bin/brassI_prep_bam.pl bin/brass.pl + bin/combineResults.pl bin/findbp bin/findbp-all bin/make-repeat-file diff --git a/perl/bin/brass.pl b/perl/bin/brass.pl index c2836b0..a7137bc 100755 --- a/perl/bin/brass.pl +++ b/perl/bin/brass.pl @@ -97,14 +97,20 @@ sub cleanup { my $options = shift; my $tmpdir = $options->{'tmp'}; - system("cp $tmpdir/*.brm.bam $options->{outdir}/."); - move(File::Spec->catdir($tmpdir, 'logs'), File::Spec->catdir($options->{'outdir'}, 'logs')) || die $!; my $basefile = File::Spec->catfile($options->{'outdir'}, $options->{'safe_tumour_name'}.'_vs_'.$options->{'safe_normal_name'}); unlink "$basefile.groups"; + unlink "$basefile.groups.filtered.bedpe"; unlink "$basefile.assembled.bedpe"; + unlink $basefile.'_ann.assembled.vcf'; + unlink $basefile.'_ann.assembled.bedpe'; unlink "$basefile.annot.vcf"; unlink "$basefile.annot.vcf.srt"; + unlink $basefile.'_ann.groups.filtered.vcf'; + unlink $basefile.'_ann.groups.filtered.bedpe'; + + system("cp $tmpdir/*.brm.bam $options->{outdir}/."); + move(File::Spec->catdir($tmpdir, 'logs'), File::Spec->catdir($options->{'outdir'}, 'logs')) || die $!; remove_tree $tmpdir if(-e $tmpdir); return 0; diff --git a/perl/bin/brassI_filter.pl b/perl/bin/brassI_filter.pl index cf1d085..73fc3bb 100755 --- a/perl/bin/brassI_filter.pl +++ b/perl/bin/brassI_filter.pl @@ -39,12 +39,14 @@ use strict; use warnings FATAL => 'all'; +use File::Copy qw(move); use Sanger::CGP::BrassFilter::BrassMarkedGroups; use Sanger::CGP::BrassFilter::TransFlag; use Sanger::CGP::BrassFilter::OccursFlag; use Sanger::CGP::BrassFilter::CnFlag; use Sanger::CGP::BrassFilter::BlatFlag; +use Sanger::CGP::BrassFilter::Cleanup; use File::Which; use Getopt::Long; @@ -163,7 +165,10 @@ if ($rblat_only) { $do_process = 0; $do_trans = 0; $do_occurrences = 0; $do_copynumber = 0; $do_range_blat = 1; } # process core information and print to outfile -if ($do_process) { process($infile, $outfile, $tumour); } +if ($do_process) { + process($infile, $outfile, $tumour); + cleanup($outfile); +} # check for translocations if ($do_trans) { update_translocations($outfile, $bal_trans_field, $inv_field, $bal_distance, $inv_distance); } @@ -177,7 +182,6 @@ # check for L v H range blat scores if ($do_range_blat) { update_blat($outfile, $blat_field, $blat_script, $ref, $minIdentity); } -#-----------------------------------------------------------------------------# #-----------------------------------------------------------------------------# sub process { my ($infile, $outfile, $tumour) = @_; @@ -205,6 +209,19 @@ sub process { print "finished printing to $outfile at $date"; } } + +#-----------------------------------------------------------------------------# +sub cleanup { + my ($outfile) = @_; + $outfile = "$outfile.bedpe" unless($outfile =~ m/\.bedpe$/); + my $new_out = "$outfile.tmp"; + + my $cleanup = Sanger::CGP::BrassFilter::Cleanup->new(-infile => $outfile, + -outfile => $new_out); + $cleanup->process(); + move($new_out, $outfile) || die $!; +} + #------------------------------------------------------------------------------------------------# sub update_translocations { diff --git a/perl/bin/combineResults.pl b/perl/bin/combineResults.pl new file mode 100755 index 0000000..8d1893e --- /dev/null +++ b/perl/bin/combineResults.pl @@ -0,0 +1,210 @@ +#!/usr/bin/perl + +use strict; +use warnings FATAL => 'all'; +use autodie; +use Const::Fast qw(const); + +const my @NEW_BEDPE_HEADER => ('# chr1','start1','end1','chr2','start2','end2','id/name','brass_score','assembly_score','strand1','strand2','sample','readpair names','readpair count','bal_trans','inv','occL','occH','copynumber_flag','range_blat','Brass Notation','non-template','micro-homology','assembled readnames','assembled read count','gene','gene_id','transcript_id','strand','end_phase','region','region_number','total_region_count','first/last','gene','gene_id','transcript_id','strand','phase','region','region_number','total_region_count','first/last','fusion_flag'); + +if(scalar @ARGV != 3) { + warn "USAGE: combineResults.pl X_ann.groups X_ann.assembled X.final\n"; + warn "\tX.final is a prefix, relevant suffixes will be added for VCF and BEDPE outputs\n"; + exit 1; +} +my ($groups_prefix, $assembled_prefix, $final_prefix) = @ARGV; + + +mergeVcf($groups_prefix, $assembled_prefix, $final_prefix); +mergeBedpe($groups_prefix, $assembled_prefix, $final_prefix); + +sub mergeVcf { + my ($groups_prefix, $assembled_prefix, $final_prefix) = @_; + + my $assembled_vcf = "$assembled_prefix.vcf"; + my $phaseI_vcf = "$groups_prefix.vcf"; + + my $final_vcf = "$final_prefix.vcf"; + + my %orig_data; + open my $ORIG, '<', $assembled_vcf; + while(my $line = <$ORIG>) { + next if($line =~ m/^#/); + chomp $line; + my @bits = split /\t/, $line; + my $id = $bits[2]; + $orig_data{$id} = \@bits; + } + close $ORIG; + + my $new_info = q{##INFO=} + .qq{\n} + .q{##INFO=}; + my $new_format = q{##FORMAT=}; + + my $info_found = 0; + my $format_found = 0; + + open my $FINAL, '>', $final_vcf; + open my $FIXED, '<', $phaseI_vcf; + + my @ends; + + while(my $line = <$FIXED>) { + if(@ends == 2) { + svclass(\@ends); + print $FINAL join("\t",@{$ends[0]}), "\n"; + print $FINAL join("\t",@{$ends[1]}), "\n"; + @ends = (); + } + if($line =~ m/^#/) { + if($info_found == 0 && $line =~ m/^##INFO/) { + print $FINAL $new_info,"\n"; + $info_found = 1; + } + if($format_found == 0 && $line =~ m/^##FORMAT/) { + print $FINAL $new_format,"\n"; + $format_found = 1; + } + print $FINAL $line; + next; + } + chomp $line; + my @bits = split /\t/, $line; + my $id = $bits[2]; + if(!exists $orig_data{$id}) { + $bits[-3] .= ':PS'; + $bits[-2] = '0:'.$bits[-2]; # normal is always 0:0 + $bits[-1] = '0:'.$bits[-1]; + + push @ends, \@bits; + next; + } + else { + my $old_info = $orig_data{$id}->[7]; + my ($tsrds) = $bits[7] =~ m/(TSRDS=[^;]+)/; + + $bits[7] = $old_info.';'.$tsrds; + $bits[7] .= ';BAS='.$orig_data{$id}->[5]; + $bits[5] = q{.}; + $bits[-3] .= ':PS'; + $bits[-2] = '0:'.$bits[-2]; # normal is always 0:0 + $bits[-1] = $orig_data{$id}->[-1].':'.$bits[-1]; + + $bits[7] =~ s/;;/;/g; + + push @ends, \@bits; + next; + } + } + if(@ends == 2) { + svclass(\@ends); + print $FINAL join("\t",@{$ends[0]}), "\n"; + print $FINAL join("\t",@{$ends[1]}), "\n"; + } + + close $FIXED; + close $FINAL; +} + +sub mergeBedpe { + my ($groups_prefix, $assembled_prefix, $final_prefix) = @_; + my $assembled_bedpe = "$assembled_prefix.bedpe"; + my $phaseI_bedpe= "$groups_prefix.bedpe"; + my $final_bedpe = "$final_prefix.bedpe"; + + my %orig_data; + open my $ORIG, '<', $assembled_bedpe; + while(my $line = <$ORIG>) { + next if($line =~ m/^#/); + chomp $line; + my @bits = split /\t/, $line; + my $id = $bits[6]; + $orig_data{$id} = \@bits; + } + close $ORIG; + + open my $FINAL, '>', $final_bedpe; + open my $FIXED, '<', $phaseI_bedpe; + + print $FINAL join("\t", @NEW_BEDPE_HEADER),"\n"; + + while(my $line = <$FIXED>) { + next if($line =~ m/^#/); + chomp $line; + my @bits = split /\t/, $line; + my $id = $bits[6]; + if(index($id, ',') != -1) { + my @ids = split /,/, $id; + my @multi_record; + for(@ids) { + push @multi_record, $orig_data{$id} if(exists $orig_data{$id}); + } + if(scalar @multi_record) { + warn "MULTI\n"; + warn "$line\n"; + warn Dumper(\@multi_record); + } + die "merged events correlate with assembled results, don't know what to do"; + } + elsif(!exists $orig_data{$id}) { + my @new; + push @new, @bits[0..7]; # 1-8 + push @new, q{_}; # 9 + push @new, @bits[8..9]; # 10-11 + push @new, $bits[16]; # 12 + push @new, @bits[18..25]; # 13-20 + push @new, q{_},q{_},q{_},q{_},q{_}; # 21-25 + push @new, @bits[26..44]; # 26-44 + + print $FINAL join("\t", @new),"\n"; + next; + } + else { + my @old_brass_II = @{$orig_data{$id}}; + my @new; + push @new, @old_brass_II[0..6]; # 1-7 + push @new, q{_}; # 8 + push @new, @old_brass_II[7..10]; # 9-12 + push @new, @bits[18..25]; # 13-20 + push @new, @old_brass_II[11..14]; # 21-24 + push @new, scalar (split /,/, $new[-1]); # 25 # assembled read count + push @new, @bits[26..44]; # 26-44 + + print $FINAL join("\t", @new),"\n"; + next; + } + } + + + close $FIXED; + close $FINAL; + + return; +} + + +sub svclass { + my ($end_a, $end_b) = @{$_[0]}; + die "Record IDs out of sync at:\n",join("\t",@{$end_a}),"\n",join("\t",@{$end_b}),"\n" if($end_a->[2] !~ m/_[12]$/ || $end_b->[2] !~ m/_[12]$/); + my $class; + if($end_a->[4] =~ m/^[[:upper:]]\[/) { + # ++ + $class = 'deletion'; + } + elsif($end_a->[4] =~ m/^[[:upper:]]\]/ || $end_a->[4] =~ m/\[[[:upper:]]$/) { + # +- / -+ + $class = 'inversion'; + } + elsif($end_a->[4] =~ m/\][[:upper:]]$/) { + # -- + $class = 'tandem-duplication'; + } + else { + die "Unknown rearrangement syntax: $end_a->[4]\n"; + } + $end_a->[7] .= ';SVCLASS='.$class; + $end_b->[7] .= ';SVCLASS='.$class; + return; +} + diff --git a/perl/docs.tar.gz b/perl/docs.tar.gz index 0608114..aa3d268 100644 Binary files a/perl/docs.tar.gz and b/perl/docs.tar.gz differ diff --git a/perl/lib/Bio/Brass.pm b/perl/lib/Bio/Brass.pm index 7950a4e..aad39b1 100644 --- a/perl/lib/Bio/Brass.pm +++ b/perl/lib/Bio/Brass.pm @@ -46,7 +46,7 @@ our @EXPORT = qw(find_breakpoints find_dusty_vertices is_dusty get_isolated_bp_alignment get_isolated_bp_surrounding_region $VERSION); -our $VERSION = '2.2.0'; +our $VERSION = '3.0.0'; =head1 NAME diff --git a/perl/lib/Sanger/CGP/Brass/Implement.pm b/perl/lib/Sanger/CGP/Brass/Implement.pm index 739eb4f..cadd01f 100644 --- a/perl/lib/Sanger/CGP/Brass/Implement.pm +++ b/perl/lib/Sanger/CGP/Brass/Implement.pm @@ -248,11 +248,36 @@ sub grass { return 1 if PCAP::Threaded::success_exists(File::Spec->catdir($tmp, 'progress'), 0); my $assembled = File::Spec->catfile($options->{'outdir'}, $options->{'safe_tumour_name'}.'_vs_'.$options->{'safe_normal_name'}.'.assembled.bedpe'); + my $groups = File::Spec->catfile($options->{'outdir'}, $options->{'safe_tumour_name'}.'_vs_'.$options->{'safe_normal_name'}.'.groups.filtered.bedpe'); my $merge = sprintf '(cat %s/assemble/bedpe.* | sort -k1,1 -k 2,2n > %s)', $tmp, $assembled; my $tumour = File::Spec->catfile($tmp, $options->{'safe_tumour_name'}).'.brm.bam'; my $normal = File::Spec->catfile($tmp, $options->{'safe_normal_name'}).'.brm.bam'; + my $annot_phaseI_prefix = File::Spec->catfile($options->{'outdir'}, $options->{'safe_tumour_name'}.'_vs_'.$options->{'safe_normal_name'}.'_ann.groups.filtered'); + my $annot_phaseII_prefix = File::Spec->catfile($options->{'outdir'}, $options->{'safe_tumour_name'}.'_vs_'.$options->{'safe_normal_name'}.'_ann.assembled'); + + my $final = File::Spec->catfile($options->{'outdir'}, $options->{'safe_tumour_name'}.'_vs_'.$options->{'safe_normal_name'}.'.annot'); + + my $combine_cmd = "$^X "; + $combine_cmd .= _which('combineResults.pl'); + $combine_cmd .= ' '.$annot_phaseI_prefix; + $combine_cmd .= ' '.$annot_phaseII_prefix; + $combine_cmd .= ' '.$final; + + PCAP::Threaded::external_process_handler(File::Spec->catdir($tmp, 'logs'), + [ $merge, + _grass($options, $assembled), + _grass($options, $groups), + $combine_cmd], + 0); + + PCAP::Threaded::touch_success(File::Spec->catdir($tmp, 'progress'), 0); + return 1; +} + +sub _grass { + my ($options, $input) = @_; my $grass_cmd = "$^X "; $grass_cmd .= _which('grass.pl'); $grass_cmd .= sprintf $GRASS, $options->{'g_cache'}, @@ -263,19 +288,8 @@ sub grass { $options->{'protocol'}, $options->{'tumour_name'}, $options->{'normal_name'}, - $assembled; - - PCAP::Threaded::external_process_handler(File::Spec->catdir($tmp, 'logs'), [$merge, $grass_cmd], 0); - - my $munged_name =File::Spec->catfile($options->{'outdir'}, $options->{'safe_tumour_name'}.'_vs_'.$options->{'safe_normal_name'}.'_ann.assembled.vcf'); - my $annotated = File::Spec->catfile($options->{'outdir'}, $options->{'safe_tumour_name'}.'_vs_'.$options->{'safe_normal_name'}.'.annot.vcf'); - move $munged_name, $annotated || die $!; - $munged_name =~ s/vcf$/bedpe/; - $annotated =~ s/vcf$/bedpe/; - move $munged_name, $annotated || die $!; - - PCAP::Threaded::touch_success(File::Spec->catdir($tmp, 'progress'), 0); - return 1; + $input; + return $grass_cmd; } sub split_count { diff --git a/perl/lib/Sanger/CGP/BrassFilter/Cleanup.pm b/perl/lib/Sanger/CGP/BrassFilter/Cleanup.pm new file mode 100644 index 0000000..239fb1c --- /dev/null +++ b/perl/lib/Sanger/CGP/BrassFilter/Cleanup.pm @@ -0,0 +1,401 @@ +package Sanger::CGP::BrassFilter::Cleanup; + +########## LICENCE ########## +# Copyright (c) 2014,2015 Genome Research Ltd. +# +# Author: Cancer Genome Project +# +# This file is part of BRASS. +# +# BRASS is free software: you can redistribute it and/or modify it under +# the terms of the GNU Affero General Public License as published by the Free +# Software Foundation; either version 3 of the License, or (at your option) any +# later version. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more +# details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . +# +# 1. The usage of a range of years within a copyright statement contained within +# this distribution should be interpreted as being equivalent to a list of years +# including the first and last year specified and all consecutive years between +# them. For example, a copyright statement that reads ‘Copyright (c) 2005, 2007- +# 2009, 2011-2012’ should be interpreted as being identical to a statement that +# reads ‘Copyright (c) 2005, 2007, 2008, 2009, 2011, 2012’ and a copyright +# statement that reads ‘Copyright (c) 2005-2012’ should be interpreted as being +# identical to a statement that reads ‘Copyright (c) 2005, 2006, 2007, 2008, +# 2009, 2010, 2011, 2012’." +########## LICENCE ########## + +use strict; +use autodie qw(:all); +use warnings FATAL => 'all'; +use List::Util qw(min max sum); +use Data::Dumper; + +use Bio::Brass; +our $VERSION = Bio::Brass->VERSION; + +use Const::Fast qw(const); + +const my %DISPATCH => ('chr1' => \&die_if_diff, + 'start1' => \&l_min, + 'end1' => \&l_max, + 'start2' => \&l_min, + 'chr2' => \&die_if_diff, + 'end2' => \&l_max, + 'id/name' => \&l_join, + 'brass_score' => \&l_read_count, + 'strand1' => \&die_if_diff, + 'strand2' => \&die_if_diff, + 'repeats' => \&merge_repeats, + 'np_sample_count' => \&l_sum, + 'tumour_count' => \&l_read_count, + 'normal_count' => \&l_sum, + 'np_count' => \&l_sum, + 'bkpt_distance' => \&correct_bp, + 'sample' => \&uniq_list, + 'sample_type' => \&uniq_list, + 'names' => \&uniq_list, + 'count' => \&sum_counts, + 'bal_trans' => \&die_if_diff, + 'inv' => \&l_join, + 'occL' => \&no_op, + 'occH' => \&no_op, + 'copynumber_flag' => \&die_if_diff, + 'range_blat' => \&l_join,); + +sub new { + my ($class, %args) = @_; + my $self = {}; + bless $self,$class; + + $self->{'infile'} = $args{-infile} if(defined $args{-infile}); + $self->{'outfile'} = $args{-outfile} if(defined $args{-outfile}); + + return $self; +} + +sub process { + my $self = shift; + my @headers; + + my $in_file = $self->{'infile'}; + my $out_file = $self->{'outfile'}; + + open my $IN, '<', $in_file; + my $last_header; + my (%loc_by_name, %name_by_loc); + my @header_keys; + my %counts; + my %end_count; + my @pre_clean; + + while(my $line = <$IN>) { + if($line =~ m/^#/) { + chomp $line; + push @headers, $line; + next; + } + unless(defined $last_header) { + $last_header = $headers[-1]; + my @lookups = header_map($last_header); + %loc_by_name = %{$lookups[0]}; + %name_by_loc = %{$lookups[1]}; + @header_keys = @{$lookups[2]}; + } + chomp $line; + $counts{'Groups Total'}++; + my %group; + my @elements = split /\t/, $line; + for my $i(0..((scalar @elements)-1)) { + my $name = $name_by_loc{$i}; + $elements[$i] =~ s/\s+//g if($name eq 'repeats'); + $group{ $name_by_loc{$i} } = $elements[$i]; + } + $group{'_orig'} = $line; + push @pre_clean, \%group; + $end_count{"$group{chr1}:$group{start1}-$group{end1}:$group{strand1}"}++; + $end_count{"$group{chr2}:$group{start2}-$group{end2}:$group{strand2}"}++; + #push @{$chr_strand_grps{join(':',@elements[0..5],@elements[8..9])}}, $groups[-1]; + } + close $IN; + +warn 'loaded: '.$counts{'Groups Total'}; + + my $grps_before_clean = scalar @pre_clean; + my $end_occ_count = 0; + my %chr_strand_grps; + while(my $group = shift @pre_clean) { + next if($end_count{ "$group->{chr1}:$group->{start1}-$group->{end1}:$group->{strand1}" } > 10); + next if($end_count{ "$group->{chr2}:$group->{start2}-$group->{end2}:$group->{strand2}" } > 10); + push @{$chr_strand_grps{join(':',$group->{chr1}, $group->{chr2}, $group->{strand1}, $group->{strand2})}}, $group; + $end_occ_count++; + } + + $counts{'Discard - EndOcc'} = $counts{'Groups Total'} - $end_occ_count; + + #@groups = @{merge_groups(\@groups)}; + my @groups; + for my $key(keys %chr_strand_grps) { + warn "Merging groups from set: $key (".(scalar @{$chr_strand_grps{$key}}).")\n"; + push @groups, @{merge_groups( $chr_strand_grps{$key} )}; + } + $counts{'Discard - merged'} = $end_occ_count - scalar @groups; + + @groups = @{filter_groups(\@groups, \%counts)}; + + $counts{'Groups Kept'} = scalar @groups; + + # now fix the occL/H values + occX(\@groups); + + my $col_max = (scalar (keys %name_by_loc))-1; + open my $OUT, '>', $out_file; + print $OUT join("\n", @headers), "\n"; + for my $group(@groups) { + for my $i(0..$col_max) { + print $OUT $group->{ $name_by_loc{$i} }; + print $OUT "\t" if($i != $col_max); + } + print $OUT "\n"; + } + close $OUT; + + for(sort keys %counts) { + warn "$_:\t$counts{$_}\n"; + } +} + +sub occX { + my $groups = shift; + for my $this(@{$groups}) { + my ($low, $high) = (0,0); + for my $that(@{$groups}) { + # I count myself too + $low++ if(($this->{'start1'}-500) <= $that->{'end1'} && ($this->{'end1'}+500) >= $that->{'start1'}); + $high++ if(($this->{'start2'}-500) <= $that->{'end2'} && ($this->{'end2'}+500) >= $that->{'start2'}); + } + $this->{'occL'} = $low; + $this->{'occH'} = $high; + } +} + +sub filter_groups { + my ($groups, $counts) = @_; + my @new_groups; + while(my $group = shift @{$groups}) { + if(discard_inv($group) == 1) { + $counts->{'Discard - Small Inv'}++; + next; + } + if(discard_smallDel($group) == 1) { + $counts->{'Discard - Small del'}++; + next; + } + push @new_groups, $group; + } + return \@new_groups; +} + +sub merge_groups { + my ($groups, $header_keys) = @_; + my @new_groups; + while (my $this = shift @{$groups}) { + # munge the coords for this event if not done in usage + unless(exists $this->{'_start1'}) { + $this->{'_start1'} = $this->{'start1'}-250; + $this->{'_start2'} = $this->{'start2'}-250; + $this->{'_end1'} = $this->{'end1'}+250; + $this->{'_end2'} = $this->{'end2'}+250; + } + my $merged = 0; + for my $that(@{$groups}) { + # will only ever be called once per group as modifying the primary reference + unless(exists $that->{'_start1'}) { + $that->{'_start1'} = $that->{'start1'}-250; + $that->{'_start2'} = $that->{'start2'}-250; + $that->{'_end1'} = $that->{'end1'}+250; + $that->{'_end2'} = $that->{'end2'}+250; + } + + my $a_s1 = $this->{'_start1'}; + my $a_e1 = $this->{'_end1'}; + my $a_s2 = $this->{'_start2'}; + my $a_e2 = $this->{'_end2'}; + + my $b_s1 = $that->{'_start1'}; + my $b_e1 = $that->{'_end1'}; + my $b_s2 = $that->{'_start2'}; + my $b_e2 = $that->{'_end2'}; + + if(($a_s1 <= $b_e1 && $a_e1 >= $b_s1) && ($a_s2 <= $b_e2 && $a_e2 >= $b_s2)) { + merge_event($this, $that, $header_keys); + $merged = 1; + # clear these as they will now need a different range + delete $that->{'_start1'}; + delete $that->{'_start2'}; + delete $that->{'_end1'}; + delete $that->{'_end2'}; + last; # can only merge once in a loop + } + } + + push @new_groups, $this unless($merged); + } + return \@new_groups; +} + +sub discard_smallDel { + my $group = shift; + # must be same chr, different strand to filter + return 0 if($group->{'chr1'} ne $group->{'chr2'}); + return 0 if($group->{'strand1'} ne '+' || $group->{'strand2'} ne '-'); + my $dist = $group->{'start2'} - $group->{'end1'}; + return 1 if($dist < 500); + return 0; +} + +sub discard_inv { + my $group = shift; + # is it an inversion + return 0 if(($group->{'chr1'} ne $group->{'chr2'}) || ($group->{'strand1'} ne $group->{'strand2'})); + + my $t_count = $group->{'tumour_count'}; + + # some fail fast checks + return 1 if($t_count < 4); + return 0 if($t_count >= 10); + + my $dist = 1_000_000_000_000; # I just need a big number to start + + for my $g1 ($group->{'start1'}, $group->{'end1'}) { + for my $g2 ($group->{'start2'}, $group->{'end2'}) { + my $abs_dist = abs $g1 - $g2; + $dist = $abs_dist if($abs_dist < $dist); + } + } + + return 0 if($dist >= 5000 && $t_count >= 4); + return 0 if($dist < 5000 && $t_count >= 10); + return 1; +} + +sub header_map { + my $header_line = shift; + $header_line =~ s/^#\s+//; + my @elements = split /\t/, $header_line; + my (%loc_by_name, %name_by_loc); + my @header_keys; + my @to_end; + my $count = 0; + for my $e(@elements) { + if($e =~ m/_count$/ || $e eq 'brass_score') { + unshift @to_end, $e; + } elsif($e eq 'count' || $e eq 'bkpt_distance') { + # have to be done last + push @to_end, $e; + } else { + push @header_keys, $e; + } + $loc_by_name{$e} = $count; + $name_by_loc{$count} = $e; + $count++; + } + push @header_keys, @to_end; + return (\%loc_by_name, \%name_by_loc, \@header_keys); +} + +sub uniq_list { + my ($key, $a, $b) = @_; + my %d = map {$_ => 1} (split /,/, $a->{$key}),(split /,/, $b->{$key}); + return (join ',', keys %d); +} + +sub l_min { + my ($key, $a, $b) = @_; + return min($a->{$key}, $b->{$key}); +} + +sub l_max { + my ($key, $a, $b) = @_; + return max($a->{$key}, $b->{$key}); +} + +sub l_sum { + my ($key, $a, $b) = @_; + return sum($a->{$key}, $b->{$key}); +} + +sub l_read_count { + my ($key, $a, $b) = @_; + my %d = map {$_ => 1} (split /,/, $a->{'names'}),(split /,/, $b->{'names'}); + my @keys = keys %d; + return scalar @keys; +} + +sub l_join { + my ($key, $a, $b) = @_; + return q{} if($a->{$key} eq q{} && $b->{$key} eq q{}); + return $a->{$key} if($a->{$key} ne q{} && $b->{$key} eq q{}); + return $b->{$key} if($a->{$key} eq q{} && $b->{$key} ne q{}); + return $a->{$key}.','.$b->{$key}; +} + +sub sum_counts { + my ($key, $a, $b) = @_; + # ignore $key + my $sum = 0; + for(keys %{$b}) { + next if($_ !~ m/_count$/); + $sum += $b->{$_}; + } + return $sum; +} + +sub correct_bp { + my ($key, $a, $b) = @_; + if($a->{$key} == -1 || $b->{$key} == -1) { + return -1 if($a->{$key} == $b->{$key}); + die "\n'$key' value combination is impossible for data:\n$a->{_orig}\n$b->{_orig}\n"; + } + return $b->{'start2'} - $b->{'start1'}; +} + +sub die_if_diff { + my ($key, $a, $b) = @_; + if($a->{$key} ne $b->{$key}) { + warn "$key, $a->{$key}, $b->{$key}\n"; + warn Dumper($a, $b); + die "No case provided to handle this difference.\n"; + } +} + +sub no_op { + my ($key, $a, $b) = @_; + return $b->{$key}; +} + +sub merge_repeats { + my ($key, $a, $b) = @_; + # don't really need the key but use it anyway + return q{} if($a->{$key} eq q{} && $b->{$key} eq q{}); + return $a->{$key} if($a->{$key} ne q{} && $b->{$key} eq q{}); + return $b->{$key} if($a->{$key} eq q{} && $b->{$key} ne q{}); + return $a->{$key}.','.$b->{$key}; +} + +sub merge_event { + my ($this, $that, $header_keys) = @_; + for my $key(@{$header_keys}) { + $that->{$key} = $DISPATCH{$key}->($key, $this, $that); + } + return 1; +} + + +1; diff --git a/setup.sh b/setup.sh index 3fca066..361ab77 100755 --- a/setup.sh +++ b/setup.sh @@ -133,6 +133,12 @@ if [[ "x$CHK" == "x" ]] ; then exit 1; fi +CHK=`perl -le 'eval "require $ARGV[0]" and print $ARGV[0]->VERSION' Sanger::CGP::Grass` +if [[ "x$CHK" == "x" ]] ; then + echo "PREREQUISITE: Please install grass before proceeding: https://github.com/cancerit/grass/releases" + exit 1; +fi + #create a location to build dependencies SETUP_DIR=$INIT_DIR/install_tmp mkdir -p $SETUP_DIR