-
Notifications
You must be signed in to change notification settings - Fork 2
/
addAlignedGaps.pl
executable file
·256 lines (232 loc) · 10.2 KB
/
addAlignedGaps.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
#!/usr/bin/env perl
use warnings;
use strict;
use IO::Uncompress::Gunzip qw(gunzip $GunzipError);
use IO::Compress::Gzip qw(gzip $GzipError);
use Pod::Usage;
use Getopt::Long qw(GetOptions);
Getopt::Long::Configure qw(gnu_getopt);
#Diagnostics:
use Data::Dumper qw(Dumper);
######################################################################
# addAlignedGaps.pl #
# Usage: #
# addAlignedGaps.pl [-a input alignment] [-i unaligned FASTAs] #
# [-o output aligned FASTA] #
# #
# Arguments: #
# -a,--input_alignment Aligned multi-FASTA of references where #
# the prefix up to _ is the species name #
# (mandatory, no default) #
# -i,--input_FASTA FASTA of pseudoreferences to put into the #
# aligned coordinate space, where prefixes #
# up to _ are the species name, matching one #
# of the species in the alignment #
# (default: STDIN) #
# -o,--output_alignment Filename for output aligned multi-FASTA #
# containing sequences from both the input #
# alignment and gap-inserted versions of the #
# unaligned sequences #
# (default: STDOUT) #
# Description: #
# addAlignedGaps.pl produces an aligned FASTA of all sequences based#
# on adding gaps found in an alignment of reference sequences to a #
# set of unaligned sequences, each set in an identifiable reference #
# coordinate space. We use the prefix of the FASTA header before the#
# first _ to identify the reference, and to match to equivalent #
# prefixes in the unaligned FASTA headers. #
######################################################################
my $SCRIPTNAME = "addAlignedGaps.pl";
my $VERSION = "1.0";
=pod
=head1 NAME
addAlignedGaps.pl - Add gaps to unaligned sequences and append to MSA
=head1 SYNOPSIS
addAlignedGaps.pl [options]
Options:
--help,-h,-? Display this help documentation
-a,--input_alignment Aligned multi-FASTA of references where
the prefix up to _ is the species name
(mandatory, no default)
-i,--input_FASTA FASTA of pseudoreferences to put into the
aligned coordinate space, where prefixes
up to _ are the species name, matching one
of the species in the alignment
(default: STDIN)
-o,--output_alignment Filename for output aligned multi-FASTA
containing sequences from both the input
alignment and gap-inserted versions of the
unaligned sequences
(default: STDOUT)
--version,-v Output version string
--debug,-d Print diagnostic output to STDERR
=head1 DESCRIPTION
addAlignedGaps.pl produces an aligned FASTA of all sequences based
on adding gaps found in an alignment of reference sequences to a
set of unaligned sequences, each set in an identifiable reference
coordinate space. We use the prefix of the FASTA header before the
first _ to identify the reference, and to match to equivalent
prefixes in the unaligned FASTA headers.
=cut
sub gapPositions($) {
#Run-length encode the gaps so that inserting terminal gaps is possible
# with splice()
my $seq = shift @_;
my @gap_positions = ([0,0]); #This is actually an array of arrays, RLE
my @bases = split //, $seq;
my $seqlen = scalar(@bases);
for (my $i = 0; $i < $seqlen; $i++) {
if ($bases[$i] eq "-") {
if ($gap_positions[$#gap_positions][0]+$gap_positions[$#gap_positions][1] == $i) {
#Extend the previous gap by one:
my @revised_gap = @{$gap_positions[$#gap_positions]};
$revised_gap[1]++;
splice(@gap_positions, $#gap_positions, 1, \@revised_gap);
} else {
#Create a new gap of length 1:
push @gap_positions, [$i, 1];
}
}
}
return \@gap_positions;
}
sub addGaps($$$) {
my $seq = shift @_;
my $species = shift @_;
my %gap_sequences = %{shift @_;};
my @gap_sequence = ();
if (exists($gap_sequences{$species})) {
@gap_sequence = @{$gap_sequences{$species}};
} else {
print STDERR "Species ${species} found in unaligned FASTA but not in alignment\n";
return undef;
}
my @gapped_sequence = split //, $seq; #We're going to splice in gaps *forwards*
#Why forwards? Because our RLE is *in* aligned coordinate space, not unaligned coordinate space
for (my $i = 0; $i <= $#gap_sequence; $i++) {
print STDERR "splice will give warning for ", $gap_sequence[$i][0], " gap of length ", $gap_sequence[$i][1], " because array is of size ", scalar(@gapped_sequence), "\n" if $gap_sequence[$i][0] > scalar(@gapped_sequence);
splice(@gapped_sequence, $gap_sequence[$i][0], 0, ('-')x$gap_sequence[$i][1]);
}
return join("", @gapped_sequence);
}
#Parse options:
my $help = 0;
my $man = 0;
my $aln_path = '';
my $in_path = '';
my $out_path = '';
my $dispversion = 0;
my $debug = 0;
GetOptions('input_alignment|a=s' => \$aln_path, 'input_FASTA|i=s' => \$in_path, 'output_FASTA|o=s' => \$out_path, 'version|v' => \$dispversion, 'help|h|?+' => \$help, 'debug|d+' => \$debug, man => \$man) or pod2usage(2);
pod2usage(-exitval => 1, -verbose => $help, -output => \*STDERR) if $help;
pod2usage(-exitval => 0, -output => \*STDERR, -verbose => 2) if $man;
print STDERR "${SCRIPTNAME} version ${VERSION}\n" if $dispversion;
exit 0 if $dispversion;
#Open input alignment and output files:
my $aln;
if (! -e $aln_path) {
print STDERR "Unable to find input aligned FASTA ${aln_path}, exiting\n";
exit 2;
}
open $aln, "<", $aln_path or die "Failed to open aligned FASTA file ${aln_path} due to error $!\n";
my $out;
if ($out_path =~ /\.gz$/) {
$out = new IO::Compress::Gzip $out_path or die "Failed to create Gzipped output FASTA file $out_path due to error $GzipError\n";
} elsif ($out_path eq '') {
open $out, ">&", \*STDOUT or die "Failed to duplicate STDOUT file handle for output FASTA due to error $!\n";
} else {
open $out, ">", $out_path or die "Failed to open output FASTA file ${out_path} due to error $!\n";
}
#Read in the necessary gap sequences from the alignment:
print STDERR "Reading in gap sequences from alignment\n" if $debug;
my %gap_sequences = ();
my $species = "";
my $gap_sequence = "";
while (my $line = <$aln>) {
print $out $line; #Feed through to the output before processing
if ($line =~ /^>/) { #Header line
if ($gap_sequence ne "" and $species ne "") {
$gap_sequences{$species} = gapPositions($gap_sequence);
print STDERR "${species}\n" if $debug > 1;
print STDERR Dumper $gap_sequences{$species} if $debug > 1;
$gap_sequence = "";
}
if ($line =~ /^>([A-Za-z0-9.-]+)_/) {
$species = $1; #Store the species name/ID via regex capture
} else {
print STDERR "Unable to detect an appropriate species name in FASTA header ${line}\nExpecting alphanumeric sequence before _\n";
close $aln;
close $out;
exit 3;
}
} else {
chomp $line;
$gap_sequence .= $line; #Add sequence to buffer
}
}
if ($gap_sequence ne "" and $species ne "") {
$gap_sequences{$species} = gapPositions($gap_sequence);
print STDERR "${species}\n" if $debug > 1;
print STDERR Dumper $gap_sequences{$species} if $debug > 1;
$gap_sequence = "";
}
close $aln;
print STDERR "Done reading in gap sequences from alignment\n" if $debug;
#Open up the unaligned sequence FASTA for streaming:
print STDERR "Adding gaps to unaligned sequences\n" if $debug;
my $in;
if ($in_path =~ /\.gz$/) {
$in = new IO::Uncompress::Gunzip $in_path or die "Failed to open Gzipped input FASTA file $in_path due to error $GunzipError\n";
} elsif ($in_path eq '') {
open $in, "<&", \*STDIN or die "Failed to duplicate STDIN file handle for input FASTA due to error $!\n";
} else {
open $in, "<", $in_path or die "Failed to open input FASTA file ${in_path} due to error $!\n";
}
#Iterate over the scaffolds:
$species = ""; #Keep track of the species prefix to extract the gap sequence
my $sequence = ""; #Store sequence progressively for wrapped FASTAs
while (my $line = <$in>) {
if ($line =~ />/) {
if ($sequence ne "" and $species ne "") {
my $gapped_sequence = addGaps($sequence, $species, \%gap_sequences);
unless (defined($gapped_sequence)) { #Catch the exception when species isn't in the hash
close $in;
close $out;
exit 5;
}
print STDERR $species, "\n" if $debug > 1;
print STDERR Dumper $gap_sequences{$species} if $debug > 1;
print $out $gapped_sequence, "\n";
$sequence = "";
}
print $out $line;
if ($line =~ /^>([A-Za-z0-9]+)_/) {
$species = $1; #Store the species name/ID via regex capture
} else {
print STDERR "Unable to detect an appropriate species name in FASTA header ${line}\nExpecting alphanumeric sequence before _\n";
close $in;
close $out;
exit 4;
}
} else {
chomp $line;
$sequence .= $line; #Add line to the sequence buffer
}
}
#Make sure to catch the final sequence:
if ($sequence ne "" and $species ne "") {
my $gapped_sequence = addGaps($sequence, $species, \%gap_sequences);
unless (defined($gapped_sequence)) { #Catch the exception when species isn't in the hash
close $in;
close $out;
exit 5;
}
print STDERR $species, "\n" if $debug > 1;
print STDERR Dumper $gap_sequences{$species} if $debug > 1;
print $out $gapped_sequence, "\n";
$sequence = "";
}
#Close input and output files:
close $in;
close $out;
exit 0;