Skip to content

Commit

Permalink
Merge pull request #108 from cancerit/feature/ignore_contigs_file
Browse files Browse the repository at this point in the history
Feature/ignore contigs file
  • Loading branch information
davidrajones authored Mar 29, 2022
2 parents 19244b4 + ce81a0e commit c07ff77
Show file tree
Hide file tree
Showing 6 changed files with 175 additions and 27 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,5 @@
/perl/pm_to_blib
.idea/*
/python/env
.vstags

30 changes: 15 additions & 15 deletions perl/bin/pindel.pl
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ BEGIN
use Sanger::CGP::Pindel::Implement;

const my @VALID_PROCESS => qw(input pindel pin2vcf merge flag);
const my @RM_PROCESS => qw( process index limit exclude excludef badloci apid);
my %index_max = ( 'input' => 2,
'pindel' => -1,
'pin2vcf' => -1,
Expand Down Expand Up @@ -112,6 +113,7 @@ sub setup {
't|tumour=s' => \$opts{'tumour'},
'n|normal=s' => \$opts{'normal'},
'e|exclude=s' => \$opts{'exclude'},
'ef|exclude-file=s' => \$opts{'excludef'},
'b|badloci=s' => \$opts{'badloci'},
# these are specifically for pin2vcf
'sp|species=s{0,}' => \@{$opts{'species'}},
Expand Down Expand Up @@ -150,6 +152,7 @@ sub setup {
PCAP::Cli::file_for_reading('reference', $opts{'reference'});
PCAP::Cli::file_for_reading('tumour', $opts{'tumour'});
PCAP::Cli::file_for_reading('normal', $opts{'normal'});
PCAP::Cli::file_for_reading('exclude file', $opts{'excludef'}) if(defined $opts{'excludef'});
unless($opts{'noflag'}) {
PCAP::Cli::file_for_reading('simrep', $opts{'simrep'});
PCAP::Cli::file_for_reading('filters', $opts{'filters'});
Expand All @@ -164,14 +167,9 @@ sub setup {
exit 0;
}


delete $opts{'process'} unless(defined $opts{'process'});
delete $opts{'index'} unless(defined $opts{'index'});
delete $opts{'limit'} unless(defined $opts{'limit'});

delete $opts{'exclude'} unless(defined $opts{'exclude'});
delete $opts{'badloci'} unless(defined $opts{'badloci'});
delete $opts{'apid'} unless(defined $opts{'apid'});
for my $title (@RM_PROCESS){
delete $opts{$title} unless(defined $opts{$title});
}

if(exists $opts{'process'}) {
PCAP::Cli::valid_process('process', $opts{'process'}, \@VALID_PROCESS);
Expand Down Expand Up @@ -263,13 +261,15 @@ =head1 SYNOPSIS
- not necessary for external use
Optional
-seqtype -st Sequencing protocol, expect all input to match [WGS]
-assembly -as Name of assembly in use
- when not available in BAM header SQ line.
-species -sp Species
- when not available in BAM header SQ line.
-exclude -e Exclude this list of ref sequences from processing, wildcard '%'
- comma separated, e.g. NC_007605,hs37d5,GL%
-seqtype -st Sequencing protocol, expect all input to match [WGS]
-assembly -as Name of assembly in use
- when not available in BAM header SQ line.
-species -sp Species
- when not available in BAM header SQ line.
-exclude -e Exclude this list of ref sequences from processing, wildcard '%'
- comma separated, e.g. NC_007605,hs37d5,GL%
-exclude-file -ef File containing a list of ref sequences to exclude from processing, wildcard '%'
- can be used in place of the -e|-exclude option
-badloci -b Tabix indexed BED file of locations to not accept as anchors
- e.g. hi-seq depth from UCSC
-skipgerm -sg Don't output events with more evidence in normal BAM.
Expand Down
29 changes: 17 additions & 12 deletions perl/lib/Sanger/CGP/Pindel/Implement.pm
Original file line number Diff line number Diff line change
Expand Up @@ -333,20 +333,25 @@ sub valid_seqs {
while(<$FAI_IN>) { push @all_seqs, (split /\t/, $_)[0]; }
close $FAI_IN;

my @exclude = ();
my @exclude_patt = ();
if(exists $options->{'exclude'}) {
my @exclude = split /,/, $options->{'exclude'};
my @exclude_patt;
for my $ex(@exclude) {
$ex =~ s/%/.+/;
push @exclude_patt, $ex;
}

for my $sq(@all_seqs) {
push @good_seqs, $sq unless(first { $sq =~ m/^$_$/ } @exclude_patt);
}
@exclude = split /,/, $options->{'exclude'};
}elsif(exists $options->{'excludef'}){
open (my $READ, '<', $options->{'excludef'}) or croak("Error opening exclue contigs file '".$options->{'excludef'}."' for reading: ".$!);
while(<$READ>){
my $line = $_;
chomp($line);
push(@exclude, $line);
}
close($READ);
}
for my $ex(@exclude) {
$ex =~ s/%/.+/;
push @exclude_patt, $ex;
}
else {
push @good_seqs, @all_seqs;
for my $sq(@all_seqs) {
push @good_seqs, $sq unless(first { $sq =~ m/^$_$/ } @exclude_patt);
}
return @good_seqs;
}
Expand Down
51 changes: 51 additions & 0 deletions perl/t/3_implement.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
########## LICENCE ##########
# Copyright (c) 2014-2022 Genome Research Ltd.
#
# Author: CASM/Cancer IT <cgphelp@sanger.ac.uk>
#
# This file is part of cgpPindel.
#
# cgpPindel is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
########## LICENCE ##########


use strict;
use Data::Dumper;
use Test::More tests => 1;

use Sanger::CGP::Pindel::Implement;

use FindBin qw($Bin);
my $lib_path = "$Bin/../lib";
my $test_data_path = "$Bin/../t/data/";

my $reference = $test_data_path.'genome_excludeseq.fa';
my $exclude_str = 'MT,GL%,hs37d5,NC_007605';
my $exclude_file_regex = $test_data_path.'test_exclude.txt';
my @exp_seqs = qw( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y);


subtest 'Test valid_seqs' => sub {
my %options = ('reference' => $reference,
'exclude' => $exclude_str);

my @good_seqs = Sanger::CGP::Pindel::Implement::valid_seqs(\%options);
is_deeply(\@good_seqs,\@exp_seqs,'Exclude string');

%options = ('reference' => $reference,
'excludef' => $exclude_file_regex,);
@good_seqs = Sanger::CGP::Pindel::Implement::valid_seqs(\%options);
is_deeply(\@good_seqs,\@exp_seqs,'Exclude file');
done_testing();
};
86 changes: 86 additions & 0 deletions perl/t/data/genome_excludeseq.fa.fai
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
1 249250621 52 60 61
2 243199373 253404903 60 61
3 198022430 500657651 60 61
4 191154276 701980507 60 61
5 180915260 896320740 60 61
6 171115067 1080251307 60 61
7 159138663 1254218344 60 61
8 146364022 1416009371 60 61
9 141213431 1564812846 60 61
10 135534747 1708379889 60 61
11 135006516 1846173603 60 61
12 133851895 1983430282 60 61
13 115169878 2119513096 60 61
14 107349540 2236602526 60 61
15 102531392 2345741279 60 61
16 90354753 2449981581 60 61
17 81195210 2541842300 60 61
18 78077248 2624390817 60 61
19 59128983 2703769406 60 61
20 63025520 2763883926 60 61
21 48129895 2827959925 60 61
22 51304566 2876892038 60 61
X 155270560 2929051733 60 61
Y 59373566 3086910193 60 61
MT 16569 3147273397 70 71
GL000207.1 4262 3147290264 60 61
GL000226.1 15008 3147294660 60 61
GL000229.1 19913 3147309981 60 61
GL000231.1 27386 3147330288 60 61
GL000210.1 27682 3147358193 60 61
GL000239.1 33824 3147386399 60 61
GL000235.1 34474 3147420849 60 61
GL000201.1 36148 3147455960 60 61
GL000247.1 36422 3147492773 60 61
GL000245.1 36651 3147529865 60 61
GL000197.1 37175 3147567189 60 61
GL000203.1 37498 3147605046 60 61
GL000246.1 38154 3147643231 60 61
GL000249.1 38502 3147682083 60 61
GL000196.1 38914 3147721289 60 61
GL000248.1 39786 3147760914 60 61
GL000244.1 39929 3147801426 60 61
GL000238.1 39939 3147842083 60 61
GL000202.1 40103 3147882750 60 61
GL000234.1 40531 3147923584 60 61
GL000232.1 40652 3147964853 60 61
GL000206.1 41001 3148006245 60 61
GL000240.1 41933 3148047992 60 61
GL000236.1 41934 3148090686 60 61
GL000241.1 42152 3148133381 60 61
GL000243.1 43341 3148176298 60 61
GL000242.1 43523 3148220424 60 61
GL000230.1 43691 3148264735 60 61
GL000237.1 45867 3148309217 60 61
GL000233.1 45941 3148355911 60 61
GL000204.1 81310 3148402680 60 61
GL000198.1 90085 3148485408 60 61
GL000208.1 92689 3148577057 60 61
GL000191.1 106433 3148671354 60 61
GL000227.1 128374 3148779624 60 61
GL000228.1 129120 3148910201 60 61
GL000214.1 137718 3149041536 60 61
GL000221.1 155397 3149181613 60 61
GL000209.1 159169 3149339663 60 61
GL000218.1 161147 3149501548 60 61
GL000220.1 161802 3149665444 60 61
GL000213.1 164239 3149830006 60 61
GL000211.1 166566 3149997046 60 61
GL000199.1 169874 3150166452 60 61
GL000217.1 172149 3150339221 60 61
GL000216.1 172294 3150514303 60 61
GL000215.1 172545 3150689532 60 61
GL000205.1 174588 3150865016 60 61
GL000219.1 179198 3151042577 60 61
GL000224.1 179693 3151224825 60 61
GL000223.1 180455 3151407576 60 61
GL000195.1 182896 3151591102 60 61
GL000212.1 186858 3151777110 60 61
GL000222.1 186861 3151967146 60 61
GL000200.1 187035 3152157185 60 61
GL000193.1 189789 3152347401 60 61
GL000194.1 191469 3152540417 60 61
GL000225.1 211173 3152735141 60 61
GL000192.1 547496 3152949897 60 61
NC_007605 171823 3153506529 60 61
hs37d5 35477943 3153681224 60 61
4 changes: 4 additions & 0 deletions perl/t/data/test_exclude.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
MT
GL%
hs37d5
NC_007605

0 comments on commit c07ff77

Please sign in to comment.