forked from NickSto/dunovo
-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathalign.c
192 lines (179 loc) · 4.58 KB
/
align.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define NAIVE_TEST_WINDOW 6
#define NAIVE_TEST_THRES 0.80
#define NAIVE_TEST_MIN 2
#define NAIVE_WINDOW 10
#define NAIVE_THRES 0.80
typedef struct Gap {
int seq;
int coord;
int length;
struct Gap *next;
} Gap;
typedef struct Gaps {
int length;
struct Gap *root;
struct Gap *tip;
} Gaps;
int _test_match(char *seq1, int start1, char *seq2, int start2);
void add_gap(Gaps *gaps, int seq, int coord, int length);
Gaps *make_gaps();
char *insert_gaps(Gaps *gaps, char *seq, int seq_num);
// A naive algorithm for aligning two sequences which are expected to be very similar to each other
// and already nearly aligned.
void naive2(char *seq1, char *seq2) {
Gaps *gaps = make_gaps();
int i = 0;
int j = 0;
int matches = 0;
while (seq1[i] != 0 && seq2[j] != 0) {
// Match?
printf("%c %c | i %d j %d\n", seq1[i], seq2[j], i, j);
if (seq1[i] == seq2[j]) {
matches++;
i++;
j++;
continue;
}
printf("mismatch!\n");
// Mismatch. Start adding gaps until the mismatches go away.
int new_i = i;
int new_j = j;
int gap_seq = 0;
int success;
while (1) {
if (seq1[new_i] == 0 && seq2[new_j] == 0) {
break;
}
success = _test_match(seq1, new_i, seq2, j);
if (success) {
gap_seq = 2;
break;
}
if (seq1[new_i] != 0) {
new_i++;
}
success = _test_match(seq1, i, seq2, new_j);
if (success) {
gap_seq = 1;
break;
}
if (seq2[new_j] != 0) {
new_j++;
}
}
// Which sequence are we putting the gap in?
if (gap_seq == 0) {
printf("No good gap found. new_i: %d, new_j: %d\n", new_i, new_j);
// No good gap found.
} else if (i == new_i && j == new_j) {
printf("No gap required.\n");
} else if (gap_seq == 1) {
printf("%dbp gap in seq1 at base %d.\n", new_j-j, j);
add_gap(gaps, 1, j, new_j-j);
j = new_j;
} else if (gap_seq == 2) {
printf("%dbp gap in seq2 at base %d.\n", new_i-i, i);
add_gap(gaps, 2, i, new_i-i);
i = new_i;
}
i++;
j++;
}
char *new_seq1 = insert_gaps(gaps, seq1, 1);
char *new_seq2 = insert_gaps(gaps, seq2, 2);
printf("alignment:\n%s\n%s\n", new_seq1, new_seq2);
}
// Check if the few bases starting at start1 and start2 in seq1 and seq2, respectively, align with
// few mismatches. The number of bases checked is NAIVE_TEST_WINDOW, and they must have a match
// percentage greater than NAIVE_TEST_THRES. Also, the amount of sequence left to compare must be
// more than NAIVE_TEST_MIN.
int _test_match(char *seq1, int start1, char *seq2, int start2) {
int matches = 0;
int total = 0;
char base1, base2;
int i;
for (i = 0; i < NAIVE_TEST_WINDOW-1; i++) {
base1 = seq1[start1+i];
base2 = seq2[start2+i];
if (base1 == 0 || base2 == 0) {
break;
}
if (base1 == base2) {
matches++;
}
total++;
}
return total > NAIVE_TEST_MIN && (double)matches/total > NAIVE_TEST_THRES;
}
Gaps *make_gaps() {
Gaps *gaps = malloc(sizeof(Gaps));
gaps->root = 0;
gaps->tip = 0;
gaps->length = 0;
return gaps;
}
void add_gap(Gaps *gaps, int seq, int coord, int length) {
Gap *gap = malloc(sizeof(Gap));
gap->next = 0;
gap->seq = seq;
gap->coord = coord;
gap->length = length;
if (gaps->root == 0) {
gaps->root = gap;
} else {
gaps->tip->next = gap;
}
gaps->tip = gap;
gaps->length++;
}
// Take gap information from the aligner and put them into the sequence string as "-" characters.
char *insert_gaps(Gaps *gaps, char *seq, int seq_num) {
if (gaps->root == 0) {
return seq;
}
// How long should the new sequence be?
int extra_len = 0;
Gap *gap = gaps->root;
while (gap) {
if (gap->seq == seq_num) {
extra_len += gap->length;
}
gap = gap->next;
}
//TODO: Handle a situation with no gaps.
int new_len = extra_len + strlen(seq) + 1;
char *new_seq = malloc(sizeof(char) * new_len);
int i = 0;
int j = 0;
gap = gaps->root;
while (gap) {
// Check that it's a gap in our sequence.
if (gap->seq != seq_num) {
gap = gap->next;
continue;
}
// Copy verbatim all the sequence until the gap.
while (i <= gap->coord) {
new_seq[j] = seq[i];
i++;
j++;
}
// Add -'s the whole length of the gap.
while (j < gap->coord + gap->length + 1) {
new_seq[j] = '-';
j++;
}
gap = gap->next;
}
// Fill in the end sequence.
while (seq[i]) {
new_seq[j] = seq[i];
i++;
j++;
}
new_seq[new_len-1] = 0;
return new_seq;
}