Copied from SVN revision 4710 and adapted to Git.
[samtools.git] / kaln.c
1 /* The MIT License
2
3    Copyright (c) 2003-2006, 2008, 2009, by Heng Li <lh3lh3@gmail.com>
4
5    Permission is hereby granted, free of charge, to any person obtaining
6    a copy of this software and associated documentation files (the
7    "Software"), to deal in the Software without restriction, including
8    without limitation the rights to use, copy, modify, merge, publish,
9    distribute, sublicense, and/or sell copies of the Software, and to
10    permit persons to whom the Software is furnished to do so, subject to
11    the following conditions:
12
13    The above copyright notice and this permission notice shall be
14    included in all copies or substantial portions of the Software.
15
16    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20    BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21    ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22    CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23    SOFTWARE.
24 */
25
26 #include <stdlib.h>
27 #include <stdio.h>
28 #include <string.h>
29 #include <stdint.h>
30 #include "kaln.h"
31
32 #define FROM_M 0
33 #define FROM_I 1
34 #define FROM_D 2
35
36 typedef struct {
37         int i, j;
38         unsigned char ctype;
39 } path_t;
40
41 int aln_sm_blosum62[] = {
42 /*       A  R  N  D  C  Q  E  G  H  I  L  K  M  F  P  S  T  W  Y  V  *  X */
43          4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0,-4, 0,
44         -1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3,-4,-1,
45         -2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3,-4,-1,
46         -2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3,-4,-1,
47          0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-4,-2,
48         -1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2,-4,-1,
49         -1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 1,-2,-3,-1, 0,-1,-3,-2,-2,-4,-1,
50          0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3,-4,-1,
51         -2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3,-4,-1,
52         -1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3,-4,-1,
53         -1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1,-4,-1,
54         -1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2,-4,-1,
55         -1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1,-4,-1,
56         -2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1,-4,-1,
57         -1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2,-4,-2,
58          1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2,-4, 0,
59          0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0,-4, 0,
60         -3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3,-4,-2,
61         -2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1,-4,-1,
62          0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,-4,-1,
63         -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4, 1,-4,
64          0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-4,-1
65 };
66
67 int aln_sm_blast[] = {
68         1, -3, -3, -3, -2,
69         -3, 1, -3, -3, -2,
70         -3, -3, 1, -3, -2,
71         -3, -3, -3, 1, -2,
72         -2, -2, -2, -2, -2
73 };
74
75 ka_param_t ka_param_blast = {  5,  2,  2, aln_sm_blast, 5, 50 };
76 ka_param_t ka_param_aa2aa = { 10,  2,  2, aln_sm_blosum62, 22, 50 };
77
78 static uint32_t *ka_path2cigar32(const path_t *path, int path_len, int *n_cigar)
79 {
80         int i, n;
81         uint32_t *cigar;
82         unsigned char last_type;
83
84         if (path_len == 0 || path == 0) {
85                 *n_cigar = 0;
86                 return 0;
87         }
88
89         last_type = path->ctype;
90         for (i = n = 1; i < path_len; ++i) {
91                 if (last_type != path[i].ctype) ++n;
92                 last_type = path[i].ctype;
93         }
94         *n_cigar = n;
95         cigar = (uint32_t*)calloc(*n_cigar, 4);
96
97         cigar[0] = 1u << 4 | path[path_len-1].ctype;
98         last_type = path[path_len-1].ctype;
99         for (i = path_len - 2, n = 0; i >= 0; --i) {
100                 if (path[i].ctype == last_type) cigar[n] += 1u << 4;
101                 else {
102                         cigar[++n] = 1u << 4 | path[i].ctype;
103                         last_type = path[i].ctype;
104                 }
105         }
106
107         return cigar;
108 }
109
110 /***************************/
111 /* START OF common_align.c */
112 /***************************/
113
114 #define SET_INF(s) (s).M = (s).I = (s).D = MINOR_INF;
115
116 #define set_M(MM, cur, p, sc)                                                   \
117 {                                                                                                               \
118         if ((p)->M >= (p)->I) {                                                         \
119                 if ((p)->M >= (p)->D) {                                                 \
120                         (MM) = (p)->M + (sc); (cur)->Mt = FROM_M;       \
121                 } else {                                                                                \
122                         (MM) = (p)->D + (sc); (cur)->Mt = FROM_D;       \
123                 }                                                                                               \
124         } else {                                                                                        \
125                 if ((p)->I > (p)->D) {                                                  \
126                         (MM) = (p)->I + (sc); (cur)->Mt = FROM_I;       \
127                 } else {                                                                                \
128                         (MM) = (p)->D + (sc); (cur)->Mt = FROM_D;       \
129                 }                                                                                               \
130         }                                                                                                       \
131 }
132 #define set_I(II, cur, p)                                                               \
133 {                                                                                                               \
134         if ((p)->M - gap_open > (p)->I) {                                       \
135                 (cur)->It = FROM_M;                                                             \
136                 (II) = (p)->M - gap_open - gap_ext;                             \
137         } else {                                                                                        \
138                 (cur)->It = FROM_I;                                                             \
139                 (II) = (p)->I - gap_ext;                                                \
140         }                                                                                                       \
141 }
142 #define set_end_I(II, cur, p)                                                   \
143 {                                                                                                               \
144         if (gap_end >= 0) {                                                                     \
145                 if ((p)->M - gap_open > (p)->I) {                               \
146                         (cur)->It = FROM_M;                                                     \
147                         (II) = (p)->M - gap_open - gap_end;                     \
148                 } else {                                                                                \
149                         (cur)->It = FROM_I;                                                     \
150                         (II) = (p)->I - gap_end;                                        \
151                 }                                                                                               \
152         } else set_I(II, cur, p);                                                       \
153 }
154 #define set_D(DD, cur, p)                                                               \
155 {                                                                                                               \
156         if ((p)->M - gap_open > (p)->D) {                                       \
157                 (cur)->Dt = FROM_M;                                                             \
158                 (DD) = (p)->M - gap_open - gap_ext;                             \
159         } else {                                                                                        \
160                 (cur)->Dt = FROM_D;                                                             \
161                 (DD) = (p)->D - gap_ext;                                                \
162         }                                                                                                       \
163 }
164 #define set_end_D(DD, cur, p)                                                   \
165 {                                                                                                               \
166         if (gap_end >= 0) {                                                                     \
167                 if ((p)->M - gap_open > (p)->D) {                               \
168                         (cur)->Dt = FROM_M;                                                     \
169                         (DD) = (p)->M - gap_open - gap_end;                     \
170                 } else {                                                                                \
171                         (cur)->Dt = FROM_D;                                                     \
172                         (DD) = (p)->D - gap_end;                                        \
173                 }                                                                                               \
174         } else set_D(DD, cur, p);                                                       \
175 }
176
177 typedef struct {
178         uint8_t Mt:3, It:2, Dt:2;
179 } dpcell_t;
180
181 typedef struct {
182         int M, I, D;
183 } dpscore_t;
184
185 /***************************
186  * banded global alignment *
187  ***************************/
188 uint32_t *ka_global_core(uint8_t *seq1, int len1, uint8_t *seq2, int len2, const ka_param_t *ap, int *_score, int *n_cigar)
189 {
190         int i, j;
191         dpcell_t **dpcell, *q;
192         dpscore_t *curr, *last, *s;
193         int b1, b2, tmp_end;
194         int *mat, end, max = 0;
195         uint8_t type, ctype;
196         uint32_t *cigar = 0;
197
198         int gap_open, gap_ext, gap_end, b;
199         int *score_matrix, N_MATRIX_ROW;
200
201         /* initialize some align-related parameters. just for compatibility */
202         gap_open = ap->gap_open;
203         gap_ext = ap->gap_ext;
204         gap_end = ap->gap_end;
205         b = ap->band_width;
206         score_matrix = ap->matrix;
207         N_MATRIX_ROW = ap->row;
208
209         *n_cigar = 0;
210         if (len1 == 0 || len2 == 0) return 0;
211
212         /* calculate b1 and b2 */
213         if (len1 > len2) {
214                 b1 = len1 - len2 + b;
215                 b2 = b;
216         } else {
217                 b1 = b;
218                 b2 = len2 - len1 + b;
219         }
220         if (b1 > len1) b1 = len1;
221         if (b2 > len2) b2 = len2;
222         --seq1; --seq2;
223
224         /* allocate memory */
225         end = (b1 + b2 <= len1)? (b1 + b2 + 1) : (len1 + 1);
226         dpcell = (dpcell_t**)malloc(sizeof(dpcell_t*) * (len2 + 1));
227         for (j = 0; j <= len2; ++j)
228                 dpcell[j] = (dpcell_t*)malloc(sizeof(dpcell_t) * end);
229         for (j = b2 + 1; j <= len2; ++j)
230                 dpcell[j] -= j - b2;
231         curr = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1));
232         last = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1));
233         
234         /* set first row */
235         SET_INF(*curr); curr->M = 0;
236         for (i = 1, s = curr + 1; i < b1; ++i, ++s) {
237                 SET_INF(*s);
238                 set_end_D(s->D, dpcell[0] + i, s - 1);
239         }
240         s = curr; curr = last; last = s;
241
242         /* core dynamic programming, part 1 */
243         tmp_end = (b2 < len2)? b2 : len2 - 1;
244         for (j = 1; j <= tmp_end; ++j) {
245                 q = dpcell[j]; s = curr; SET_INF(*s);
246                 set_end_I(s->I, q, last);
247                 end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1;
248                 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
249                 ++s; ++q;
250                 for (i = 1; i != end; ++i, ++s, ++q) {
251                         set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */
252                         set_I(s->I, q, last + i);
253                         set_D(s->D, q, s - 1);
254                 }
255                 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
256                 set_D(s->D, q, s - 1);
257                 if (j + b1 - 1 > len1) { /* bug fixed, 040227 */
258                         set_end_I(s->I, q, last + i);
259                 } else s->I = MINOR_INF;
260                 s = curr; curr = last; last = s;
261         }
262         /* last row for part 1, use set_end_D() instead of set_D() */
263         if (j == len2 && b2 != len2 - 1) {
264                 q = dpcell[j]; s = curr; SET_INF(*s);
265                 set_end_I(s->I, q, last);
266                 end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1;
267                 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
268                 ++s; ++q;
269                 for (i = 1; i != end; ++i, ++s, ++q) {
270                         set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */
271                         set_I(s->I, q, last + i);
272                         set_end_D(s->D, q, s - 1);
273                 }
274                 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
275                 set_end_D(s->D, q, s - 1);
276                 if (j + b1 - 1 > len1) { /* bug fixed, 040227 */
277                         set_end_I(s->I, q, last + i);
278                 } else s->I = MINOR_INF;
279                 s = curr; curr = last; last = s;
280                 ++j;
281         }
282
283         /* core dynamic programming, part 2 */
284         for (; j <= len2 - b2 + 1; ++j) {
285                 SET_INF(curr[j - b2]);
286                 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
287                 end = j + b1 - 1;
288                 for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i != end; ++i, ++s, ++q) {
289                         set_M(s->M, q, last + i - 1, mat[seq1[i]]);
290                         set_I(s->I, q, last + i);
291                         set_D(s->D, q, s - 1);
292                 }
293                 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
294                 set_D(s->D, q, s - 1);
295                 s->I = MINOR_INF;
296                 s = curr; curr = last; last = s;
297         }
298
299         /* core dynamic programming, part 3 */
300         for (; j < len2; ++j) {
301                 SET_INF(curr[j - b2]);
302                 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
303                 for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) {
304                         set_M(s->M, q, last + i - 1, mat[seq1[i]]);
305                         set_I(s->I, q, last + i);
306                         set_D(s->D, q, s - 1);
307                 }
308                 set_M(s->M, q, last + len1 - 1, mat[seq1[i]]);
309                 set_end_I(s->I, q, last + i);
310                 set_D(s->D, q, s - 1);
311                 s = curr; curr = last; last = s;
312         }
313         /* last row */
314         if (j == len2) {
315                 SET_INF(curr[j - b2]);
316                 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
317                 for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) {
318                         set_M(s->M, q, last + i - 1, mat[seq1[i]]);
319                         set_I(s->I, q, last + i);
320                         set_end_D(s->D, q, s - 1);
321                 }
322                 set_M(s->M, q, last + len1 - 1, mat[seq1[i]]);
323                 set_end_I(s->I, q, last + i);
324                 set_end_D(s->D, q, s - 1);
325                 s = curr; curr = last; last = s;
326         }
327
328         *_score = last[len1].M;
329         if (n_cigar) { /* backtrace */
330                 path_t *p, *path = (path_t*)malloc(sizeof(path_t) * (len1 + len2 + 2));
331                 i = len1; j = len2;
332                 q = dpcell[j] + i;
333                 s = last + len1;
334                 max = s->M; type = q->Mt; ctype = FROM_M;
335                 if (s->I > max) { max = s->I; type = q->It; ctype = FROM_I; }
336                 if (s->D > max) { max = s->D; type = q->Dt; ctype = FROM_D; }
337
338                 p = path;
339                 p->ctype = ctype; p->i = i; p->j = j; /* bug fixed 040408 */
340                 ++p;
341                 do {
342                         switch (ctype) {
343                         case FROM_M: --i; --j; break;
344                         case FROM_I: --j; break;
345                         case FROM_D: --i; break;
346                         }
347                         q = dpcell[j] + i;
348                         ctype = type;
349                         switch (type) {
350                         case FROM_M: type = q->Mt; break;
351                         case FROM_I: type = q->It; break;
352                         case FROM_D: type = q->Dt; break;
353                         }
354                         p->ctype = ctype; p->i = i; p->j = j;
355                         ++p;
356                 } while (i || j);
357                 cigar = ka_path2cigar32(path, p - path - 1, n_cigar);
358                 free(path);
359         }
360
361         /* free memory */
362         for (j = b2 + 1; j <= len2; ++j)
363                 dpcell[j] += j - b2;
364         for (j = 0; j <= len2; ++j)
365                 free(dpcell[j]);
366         free(dpcell);
367         free(curr); free(last);
368
369         return cigar;
370 }