FastAlign.pl

I did a bioperl script which uses the heuristic method of tfasty. The query string is in amino acids and the hit string is in nucleotides. There are extra nucleotides at the end of the hit string (option -diff and by default = 10), that allow to verify if the intron start with common rules (5'-GTGCGA-... for group II intron and after an exonic T for group I intron). I'm glad to help the bioperl community!
bioperl · Aug 1, 2013 · 2007f94 · 2007f94
1 parent 27e52d0
commit 2007f94
Showing 1 changed file with 224 additions and 0 deletions.
diff --git a/examples/align/FastAlign.pl b/examples/align/FastAlign.pl
@@ -0,0 +1,224 @@
+#! /usr/bin/perl
+#####################################################
+#                 Fasta
+#                     |
+#                     Align
+#
+#                     By
+#                 Antony Vincent 
+#          (a.vincent.0312@gmail.com)
+#           	
+# FastAlign is a perl script which uses the heuristic method
+# of tfasty36 for find similarity between a query sequence
+# in amino acids and a sequence in nucleotides. It provides
+# a more intuitive output to find exon-intron junctions. 
+# The query string is in amino acids and the hit string is
+# in nucleotides. There are extra nucleotides at the end of
+# the hit string (option -diff and by default = 10), that 
+# allow to verify if the intron start with common rules 
+# (5'-GTGCGA-... for group II intron and after an exonic T 
+# for group I intron).
+# 
+# The FASTA version can be changed by the user by changing
+# the line with tfasty36 for tfastyXX.
+#
+# If you have Emboss, you can genarate a graphic with option
+# -graph 1.
+#
+# For complete help: type perl fastalign.pl -help
+#              Last Update: 01/06/13
+#######################################################
+
+
+=licence
+Copyright (C) 2013  Antony Vincent
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+=cut
+
+
+
+use strict; 
+use Bio::SearchIO;
+use Bio::SeqIO;
+use Getopt::Long;
+use Bio::SeqUtils;
+
+
+## Set the default variables
+	my $identity     = 75;
+	my $length      = 50;
+	my $diff      = 10;
+	my $out      = 'output';
+	my $graphic      = 10;
+	my $query;
+	my $library;
+	my $help;
+
+GetOptions(
+    'seq=s'    => \$query,
+    'db=s'     => \$library, 
+    'graph=s'     => \$graphic,
+    'i=i'     => \$identity,
+    'l=i'     => \$length,
+    'diff=s'    => \$diff,
+    'out=s'     => \$out,   
+    'help!'     => \$help,
+) 
+or die "Incorrect usage! Try perl fastalign.pl -help for an exhaustif help.\n";
+###
+	if( $help ) 
+{ # if start
+    print "\n";
+    print "Two options are required:\n";
+    print "	-seq: Your sequence in amino acids\n";
+    print "	-db: The sequence in nucleotides (Could be a whole genome or a partial sequence...)\n";
+    print "\n";
+    print "There are few miscellaneous options:\n";
+    print "	-i: Minimum identity percentage (default = 75)\n";
+    print "	-l: Minimum match length (default = 50)\n";
+    print "	-diff: Difference between the hit string and the alignement (default = 10)\n";
+    print "	-out: Name of the output file (default = output.txt)\n";
+    print "	-graph: If this option = 1, a graph will be generated (default = no)\n";
+} # if help
+
+	else 
+{ # else start
+my $date = `date`;
+open (WRITE, ">>$out.txt"); ## Open the output file
+print WRITE "		Fasta\n";
+print WRITE "		    |\n";
+print WRITE "		    Align\n\n";
+print WRITE "Date:", $date, "\n";
+print WRITE "PARAMETERS\n";
+print WRITE "Minimum identity =", $identity, "\n";
+print WRITE "Minimum length =", $length, "\n";
+print WRITE "Diff =", $diff, "\n\n";
+
+	if ( $graphic == 1 )
+{
+open (WRITE, ">>$out.txt"); ## Open the output file
+open (WRITE2, ">>lindna.lnp"); ## Open the lindna config file
+
+## start the lindna header
+print WRITE2 "start";
+print WRITE2 "\t";
+print WRITE2 "1";
+print WRITE2 "\n";
+print WRITE2 "End";
+print WRITE2 "\t";
+my $seqio_obj = Bio::SeqIO->new(-file => "$library", -format => "fasta" );
+my $seq_obj = $seqio_obj->next_seq;
+my $count_obj = $seq_obj->length;
+print WRITE2 "$count_obj";
+print WRITE2 "\n\n";
+print WRITE2 "group";
+print WRITE2 "\n";
+}
+	else
+{
+	print "No graphic generated \n";
+}
+## run tfasty36
+my $fh;                   
+my $fasta   = "tfasty36"; # <------ You can change this line for newest fasta algorithm
+
+my $command = "$fasta $query $library";
+
+open $fh,"$command |" || die("cannot run fasta cmd of $command: $!\n");
+
+my $searchio  = Bio::SearchIO->new(-format => 'fasta', -fh => $fh);
+print WRITE "Fasta algorithm:", $fasta, "\n\n";
+			## start the parsing part of the script
+
+while( my $result = $searchio->next_result ) {
+  ## $result is a Bio::Search::Result::ResultI compliant object
+  while( my $hit = $result->next_hit ) {
+    ## $hit is a Bio::Search::Hit::HitI compliant object
+    while( my $hsp = $hit->next_hsp ) {
+      ## $hsp is a Bio::Search::HSP::HSPI compliant object
+      if( $hsp->length('total') > $length ) {
+        if ( $hsp->percent_identity >= $identity ) {
+
+          		## Generals informations
+print WRITE "Rank=", $hsp->rank, "\n",
+            "Query=",   $result->query_name, "\n",
+            "Hit=",        $hit->name, "\n" ,
+            "Length=",     $hsp->length('total'), "\n",
+            "Percent_id=", $hsp->percent_identity, "\n",
+	    "Strand=", $hsp->strand('hit'), "\n";
+
+
+			print WRITE "\n";
+
+
+
+
+			## Search for nucleotide sequences
+			print WRITE "\n";
+	my $start_hit = $hsp->start('hit'), "\n";
+	my $end_hit = $hsp->end('hit') , "\n";	
+   	my $in  = Bio::SeqIO->new(-file => "$library" , '-format' => 'fasta');
+
+    while ( my $seq = $in->next_seq() ) {#1
+
+		## looking for query position
+			my $start_query = $hsp->start('query'), "\n";
+			my $end_query = $hsp->end('query') , "\n";
+		## aa_to_3aa
+        my $seqobj = Bio::PrimarySeq->new ( -seq => $hsp->query_string);
+	my $polypeptide_3char = Bio::SeqUtils->seq3($seqobj);
+
+                ## modify the homology string
+      my $homo = $hsp->homology_string;
+         $homo =~ s/:/|||/g;
+         $homo =~ s/\./***/g;
+         $homo =~ s/ /XXX/g;
+
+
+			## HSP
+
+	print WRITE "Query($start_query,$end_query)\n";
+	print WRITE "Hit($start_hit,$end_hit)\n\n";
+	print WRITE $polypeptide_3char, "\n";
+	print WRITE $homo, "\n";
+	print WRITE $seq->subseq($start_hit,$end_hit+$diff), "\n";
+
+
+        if ( $graphic == 1)
+{ ## if start
+	 ## write in lindna file
+	print WRITE2 "label", "\n", "Block", "\t", "$start_hit", "\t",
+                      "$end_hit", "\t", "3", "\t", "H", "\n";
+        print WRITE2 "Exon", $hsp->rank, "\n";
+        print WRITE2 "endlabel";
+        print WRITE2 "\n\n";
+} ## if end
+	else
+{print "No lindna file generated\n";}
+} #1
+print WRITE "\n";
+
+        }
+      }
+    }  
+  }
+}
+        if ( $graphic == 1)
+{ ## if start
+print WRITE2 "endgroup";
+system ("lindna -infile lindna.lnp -ruler y -blocktype filled -graphout svg");
+system ("rm *.lnp");
+} ## if end
+} # else end