Skip to content

Commit 27b6efa

Browse files
committed
Support of arbitrary alphabets. Removed BLOSUM.
1 parent e28b21c commit 27b6efa

File tree

4 files changed

+21
-82
lines changed

4 files changed

+21
-82
lines changed

BLOSUM62

Lines changed: 0 additions & 24 deletions
This file was deleted.

README

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ Pairwise alignment in Python.
22

33
My contribution will be:
44
* Code cleanup
5-
* Support of arbitrary alphabets of the string to be aligned
5+
* Support of arbitrary alphabets of input strings (no similarity matrix)
66
* Support of both variants:
77
1. Smith-Waterman = local alignment
88
2. Needleman-Wunsch = global alignment

water

Lines changed: 19 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -25,63 +25,34 @@ f2=open(sys.argv[2], 'r')
2525
seq2=f2.readline()
2626
seq2=string.strip(seq2)
2727

28-
#read in BLOSUM62 matrix
29-
f3=open(sys.argv[3],'r')
30-
BLOSUM62=[]
31-
for line in f3.readlines():
32-
BLOSUM62.append(map(int, line.split()))
33-
34-
#define similar amino acids
35-
similarAA=['ST','TS','SP','PS','SA','AS','SG','GS','TP','PT','TA','AT','TG','GT','PA','AP','PG','GP','AG','GA','ND','DN','NE','EN','NQ','QN','DE','ED','DQ',
36-
'QD','EQ','QE','HR','RH','HK','KH','RK','KR','MI','IM','ML','LM','MV','VM','IL','LI','IV','VI','LV','VL','FY','YF','FW','WF','YW','WY'];
37-
3828
m,n = len(seq1),len(seq2) #length of two sequences
3929

40-
penalty=-4; #define the gap penalty
30+
match_award = 10
31+
mismatch_penalty = -5
32+
gap_penalty = -5 # both for opening and extanding
4133

4234
#generate DP table and traceback path pointer matrix
4335
score=zeros((m+1,n+1)) #the DP table
4436
pointer=zeros((m+1,n+1)) #to store the traceback path
4537

4638
P=0;
4739

48-
def match_score(alpha,beta,BLOSUM62): #the function to find match/dismatch score from BLOSUM62 by letters of AAs
49-
alphabet={}
50-
alphabet["A"] = 0
51-
alphabet["R"] = 1
52-
alphabet["N"] = 2
53-
alphabet["D"] = 3
54-
alphabet["C"] = 4
55-
alphabet["Q"] = 5
56-
alphabet["E"] = 6
57-
alphabet["G"] = 7
58-
alphabet["H"] = 8
59-
alphabet["I"] = 9
60-
alphabet["L"] = 10
61-
alphabet["K"] = 11
62-
alphabet["M"] = 12
63-
alphabet["F"] = 13
64-
alphabet["P"] = 14
65-
alphabet["S"] = 15
66-
alphabet["T"] = 16
67-
alphabet["W"] = 17
68-
alphabet["Y"] = 18
69-
alphabet["V"] = 19
70-
alphabet["B"] = 20
71-
alphabet["Z"] = 21
72-
alphabet["X"] = 22
73-
lut_x=alphabet[alpha]
74-
lut_y=alphabet[beta]
75-
return BLOSUM62[lut_x][lut_y]
40+
def match_score(alpha, beta):
41+
if alpha == beta:
42+
return match_award
43+
elif alpha == '-' or beta == '-':
44+
return gap_penalty
45+
else:
46+
return mismatch_penalty
7647

7748
max_score=P; #initial maximum score in DP table
7849

7950
#calculate DP table and mark pointers
8051
for i in range(1,m+1):
8152
for j in range(1,n+1):
82-
score_up=score[i-1][j]+penalty;
83-
score_down=score[i][j-1]+penalty;
84-
score_diagonal=score[i-1][j-1]+match_score(seq1[i-1],seq2[j-1],BLOSUM62);
53+
score_up=score[i-1][j] + gap_penalty
54+
score_down=score[i][j-1] + gap_penalty
55+
score_diagonal=score[i-1][j-1]+match_score(seq1[i-1],seq2[j-1]);
8556
score[i][j]=max(0,score_up,score_down,score_diagonal);
8657
if score[i][j]==0:
8758
pointer[i][j]=0; #0 means end of the path
@@ -125,32 +96,24 @@ align2=align2[::-1]; #reverse sequence 2
12596
i,j=0,0;
12697

12798
#calcuate identity, similarity, score and aligned sequeces
128-
def result(align1,align2,BLOSUM62):
99+
def result(align1,align2):
129100
symbol='';
130101
found=0;
131102
score=0;
132-
penalty=-4;
133103
identity,similarity=0,0;
134104
for i in range(0,len(align1)):
135105
if align1[i]==align2[i]: #if two AAs are the same, then output the letter
136106
symbol=symbol+align1[i];
137107
identity=identity+1;
138108
similarity=similarity+1;
139-
score=score+match_score(align1[i],align2[i],BLOSUM62);
109+
score=score+match_score(align1[i],align2[i]);
140110
elif align1[i]!=align2[i] and align1[i]!='-' and align2[i]!='-': #if they are not identical and none of them is gap
141-
score=score+match_score(align1[i],align2[i],BLOSUM62);
142-
for j in range(0,len(similarAA)-1):
143-
if align1[i]+align2[i]==similarAA[j]: #search whether these two AAs form a pair in similar AA table
144-
found=1;
145-
if found==1:
146-
symbol=symbol+':'; #if they are similar AA, output ':'
147-
similarity=similarity+1;
148-
if found==0:
149-
symbol=symbol+' '; #o/w, output a space
111+
score=score+match_score(align1[i],align2[i]);
112+
symbol += ' '
150113
found=0;
151114
elif align1[i]=='-' or align2[i]=='-': #if one of them is a gap, output a space
152115
symbol=symbol+' ';
153-
score=score+penalty;
116+
score = score + gap_penalty
154117

155118
identity=float(identity)/len(align1)*100;
156119
similarity=float(similarity)/len(align2)*100;
@@ -163,4 +126,4 @@ def result(align1,align2,BLOSUM62):
163126
print align2
164127
#END of function result
165128

166-
result(align1,align2,BLOSUM62)
129+
result(align1,align2)

water-test

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
#!/bin/bash
22

3-
./water <(echo NILDCQGAAADLIN) <(echo NILDCQGHHHHDLIN) BLOSUM62
3+
./water <(echo ALEXNILDCQGAAA%LIN@%) <(echo PYTHONNILDCQGHHHH%LINJJBB)

0 commit comments

Comments
 (0)