5 Copyright (c) 2003-2006, 2008, 2009, by Heng Li <lh3lh3@gmail.com>
7 Permission is hereby granted, free of charge, to any person obtaining
8 a copy of this software and associated documentation files (the
9 "Software"), to deal in the Software without restriction, including
10 without limitation the rights to use, copy, modify, merge, publish,
11 distribute, sublicense, and/or sell copies of the Software, and to
12 permit persons to whom the Software is furnished to do so, subject to
13 the following conditions:
15 The above copyright notice and this permission notice shall be
16 included in all copies or substantial portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
21 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
22 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
23 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
24 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
44 int aln_sm_blosum62[] = {
45 /* A R N D C Q E G H I L K M F P S T W Y V * X */
46 4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0,-4, 0,
47 -1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3,-4,-1,
48 -2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3,-4,-1,
49 -2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3,-4,-1,
50 0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-4,-2,
51 -1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2,-4,-1,
52 -1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 1,-2,-3,-1, 0,-1,-3,-2,-2,-4,-1,
53 0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3,-4,-1,
54 -2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3,-4,-1,
55 -1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3,-4,-1,
56 -1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1,-4,-1,
57 -1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2,-4,-1,
58 -1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1,-4,-1,
59 -2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1,-4,-1,
60 -1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2,-4,-2,
61 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2,-4, 0,
62 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0,-4, 0,
63 -3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3,-4,-2,
64 -2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1,-4,-1,
65 0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,-4,-1,
66 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4, 1,-4,
67 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-4,-1
70 int aln_sm_blast[] = {
86 ka_param_t ka_param_blast = { 5, 2, 5, 2, aln_sm_blast, 5, 50 };
87 ka_param_t ka_param_aa2aa = { 10, 2, 10, 2, aln_sm_blosum62, 22, 50 };
89 ka_param2_t ka_param2_qual = { 37, 11, 37, 11, 37, 11, 0, 0, aln_sm_qual, 5, 50 };
91 static uint32_t *ka_path2cigar32(const path_t *path, int path_len, int *n_cigar)
95 unsigned char last_type;
97 if (path_len == 0 || path == 0) {
102 last_type = path->ctype;
103 for (i = n = 1; i < path_len; ++i) {
104 if (last_type != path[i].ctype) ++n;
105 last_type = path[i].ctype;
108 cigar = (uint32_t*)calloc(*n_cigar, 4);
110 cigar[0] = 1u << 4 | path[path_len-1].ctype;
111 last_type = path[path_len-1].ctype;
112 for (i = path_len - 2, n = 0; i >= 0; --i) {
113 if (path[i].ctype == last_type) cigar[n] += 1u << 4;
115 cigar[++n] = 1u << 4 | path[i].ctype;
116 last_type = path[i].ctype;
123 /***************************/
124 /* START OF common_align.c */
125 /***************************/
127 #define SET_INF(s) (s).M = (s).I = (s).D = MINOR_INF;
129 #define set_M(MM, cur, p, sc) \
131 if ((p)->M >= (p)->I) { \
132 if ((p)->M >= (p)->D) { \
133 (MM) = (p)->M + (sc); (cur)->Mt = FROM_M; \
135 (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \
138 if ((p)->I > (p)->D) { \
139 (MM) = (p)->I + (sc); (cur)->Mt = FROM_I; \
141 (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \
145 #define set_I(II, cur, p) \
147 if ((p)->M - gap_open > (p)->I) { \
148 (cur)->It = FROM_M; \
149 (II) = (p)->M - gap_open - gap_ext; \
151 (cur)->It = FROM_I; \
152 (II) = (p)->I - gap_ext; \
155 #define set_end_I(II, cur, p) \
157 if (gap_end_ext >= 0) { \
158 if ((p)->M - gap_end_open > (p)->I) { \
159 (cur)->It = FROM_M; \
160 (II) = (p)->M - gap_end_open - gap_end_ext; \
162 (cur)->It = FROM_I; \
163 (II) = (p)->I - gap_end_ext; \
165 } else set_I(II, cur, p); \
167 #define set_D(DD, cur, p) \
169 if ((p)->M - gap_open > (p)->D) { \
170 (cur)->Dt = FROM_M; \
171 (DD) = (p)->M - gap_open - gap_ext; \
173 (cur)->Dt = FROM_D; \
174 (DD) = (p)->D - gap_ext; \
177 #define set_end_D(DD, cur, p) \
179 if (gap_end_ext >= 0) { \
180 if ((p)->M - gap_end_open > (p)->D) { \
181 (cur)->Dt = FROM_M; \
182 (DD) = (p)->M - gap_end_open - gap_end_ext; \
184 (cur)->Dt = FROM_D; \
185 (DD) = (p)->D - gap_end_ext; \
187 } else set_D(DD, cur, p); \
191 uint8_t Mt:3, It:2, Dt:3;
198 /***************************
199 * banded global alignment *
200 ***************************/
201 uint32_t *ka_global_core(uint8_t *seq1, int len1, uint8_t *seq2, int len2, const ka_param_t *ap, int *_score, int *n_cigar)
204 dpcell_t **dpcell, *q;
205 dpscore_t *curr, *last, *s;
207 int *mat, end, max = 0;
211 int gap_open, gap_ext, gap_end_open, gap_end_ext, b;
212 int *score_matrix, N_MATRIX_ROW;
214 /* initialize some align-related parameters. just for compatibility */
215 gap_open = ap->gap_open;
216 gap_ext = ap->gap_ext;
217 gap_end_open = ap->gap_end_open;
218 gap_end_ext = ap->gap_end_ext;
220 score_matrix = ap->matrix;
221 N_MATRIX_ROW = ap->row;
223 if (n_cigar) *n_cigar = 0;
224 if (len1 == 0 || len2 == 0) return 0;
226 /* calculate b1 and b2 */
228 b1 = len1 - len2 + b;
232 b2 = len2 - len1 + b;
234 if (b1 > len1) b1 = len1;
235 if (b2 > len2) b2 = len2;
238 /* allocate memory */
239 end = (b1 + b2 <= len1)? (b1 + b2 + 1) : (len1 + 1);
240 dpcell = (dpcell_t**)malloc(sizeof(dpcell_t*) * (len2 + 1));
241 for (j = 0; j <= len2; ++j)
242 dpcell[j] = (dpcell_t*)malloc(sizeof(dpcell_t) * end);
243 for (j = b2 + 1; j <= len2; ++j)
245 curr = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1));
246 last = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1));
249 SET_INF(*curr); curr->M = 0;
250 for (i = 1, s = curr + 1; i < b1; ++i, ++s) {
252 set_end_D(s->D, dpcell[0] + i, s - 1);
254 s = curr; curr = last; last = s;
256 /* core dynamic programming, part 1 */
257 tmp_end = (b2 < len2)? b2 : len2 - 1;
258 for (j = 1; j <= tmp_end; ++j) {
259 q = dpcell[j]; s = curr; SET_INF(*s);
260 set_end_I(s->I, q, last);
261 end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1;
262 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
264 for (i = 1; i != end; ++i, ++s, ++q) {
265 set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */
266 set_I(s->I, q, last + i);
267 set_D(s->D, q, s - 1);
269 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
270 set_D(s->D, q, s - 1);
271 if (j + b1 - 1 > len1) { /* bug fixed, 040227 */
272 set_end_I(s->I, q, last + i);
273 } else s->I = MINOR_INF;
274 s = curr; curr = last; last = s;
276 /* last row for part 1, use set_end_D() instead of set_D() */
277 if (j == len2 && b2 != len2 - 1) {
278 q = dpcell[j]; s = curr; SET_INF(*s);
279 set_end_I(s->I, q, last);
280 end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1;
281 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
283 for (i = 1; i != end; ++i, ++s, ++q) {
284 set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */
285 set_I(s->I, q, last + i);
286 set_end_D(s->D, q, s - 1);
288 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
289 set_end_D(s->D, q, s - 1);
290 if (j + b1 - 1 > len1) { /* bug fixed, 040227 */
291 set_end_I(s->I, q, last + i);
292 } else s->I = MINOR_INF;
293 s = curr; curr = last; last = s;
297 /* core dynamic programming, part 2 */
298 for (; j <= len2 - b2 + 1; ++j) {
299 SET_INF(curr[j - b2]);
300 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
302 for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i != end; ++i, ++s, ++q) {
303 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
304 set_I(s->I, q, last + i);
305 set_D(s->D, q, s - 1);
307 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
308 set_D(s->D, q, s - 1);
310 s = curr; curr = last; last = s;
313 /* core dynamic programming, part 3 */
314 for (; j < len2; ++j) {
315 SET_INF(curr[j - b2]);
316 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
317 for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) {
318 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
319 set_I(s->I, q, last + i);
320 set_D(s->D, q, s - 1);
322 set_M(s->M, q, last + len1 - 1, mat[seq1[i]]);
323 set_end_I(s->I, q, last + i);
324 set_D(s->D, q, s - 1);
325 s = curr; curr = last; last = s;
329 SET_INF(curr[j - b2]);
330 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
331 for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) {
332 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
333 set_I(s->I, q, last + i);
334 set_end_D(s->D, q, s - 1);
336 set_M(s->M, q, last + len1 - 1, mat[seq1[i]]);
337 set_end_I(s->I, q, last + i);
338 set_end_D(s->D, q, s - 1);
339 s = curr; curr = last; last = s;
342 *_score = last[len1].M;
343 if (n_cigar) { /* backtrace */
344 path_t *p, *path = (path_t*)malloc(sizeof(path_t) * (len1 + len2 + 2));
348 max = s->M; type = q->Mt; ctype = FROM_M;
349 if (s->I > max) { max = s->I; type = q->It; ctype = FROM_I; }
350 if (s->D > max) { max = s->D; type = q->Dt; ctype = FROM_D; }
353 p->ctype = ctype; p->i = i; p->j = j; /* bug fixed 040408 */
357 case FROM_M: --i; --j; break;
358 case FROM_I: --j; break;
359 case FROM_D: --i; break;
364 case FROM_M: type = q->Mt; break;
365 case FROM_I: type = q->It; break;
366 case FROM_D: type = q->Dt; break;
368 p->ctype = ctype; p->i = i; p->j = j;
371 cigar = ka_path2cigar32(path, p - path - 1, n_cigar);
376 for (j = b2 + 1; j <= len2; ++j)
378 for (j = 0; j <= len2; ++j)
381 free(curr); free(last);
390 #define MINUS_INF -0x40000000
392 // matrix: len2 rows and len1 columns
393 int ka_global_score(const uint8_t *_seq1, int len1, const uint8_t *_seq2, int len2, const ka_param2_t *ap)
396 #define __score_aux(_p, _q0, _sc, _io, _ie, _do, _de) { \
400 _p->M = _q->M >= _q->I? _q->M : _q->I; \
401 _p->M = _p->M >= _q->D? _p->M : _q->D; \
403 ++_q; t1 = _q->M - _io - _ie; t2 = _q->I - _ie; _p->I = t1 >= t2? t1 : t2; \
404 _q = _p-1; t1 = _q->M - _do - _de; t2 = _q->D - _de; _p->D = t1 >= t2? t1 : t2; \
407 int i, j, bw, scmat_size = ap->row, *scmat = ap->matrix, ret;
408 const uint8_t *seq1, *seq2;
409 score_aux_t *curr, *last, *swap;
410 bw = abs(len1 - len2) + ap->band_width;
411 i = len1 > len2? len1 : len2;
412 if (bw > i + 1) bw = i + 1;
413 seq1 = _seq1 - 1; seq2 = _seq2 - 1;
414 curr = calloc(len1 + 2, sizeof(score_aux_t));
415 last = calloc(len1 + 2, sizeof(score_aux_t));
420 x = j + bw; end = len1 < x? len1 : x; // band end
422 p->M = 0; p->I = p->D = MINUS_INF;
423 for (i = 1, p = &curr[1]; i <= end; ++i, ++p)
424 p->M = p->I = MINUS_INF, p->D = -(ap->edo + ap->ede * i);
425 p->M = p->I = p->D = MINUS_INF;
426 swap = curr; curr = last; last = swap;
428 for (j = 1; j < len2; ++j) {
429 int x, beg = 0, end = len1, *scrow, col_end;
431 x = j - bw; beg = 0 > x? 0 : x; // band start
432 x = j + bw; end = len1 < x? len1 : x; // band end
433 if (beg == 0) { // from zero-th column
435 p->M = p->D = MINUS_INF; p->I = -(ap->eio + ap->eie * j);
436 ++beg; // then beg = 1
438 scrow = scmat + seq2[j] * scmat_size;
439 if (end == len1) col_end = 1, --end;
441 for (i = beg, p = &curr[beg]; i <= end; ++i, ++p)
442 __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->iio, ap->iie, ap->ido, ap->ide);
444 __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->eio, ap->eie, ap->ido, ap->ide);
447 p->M = p->I = p->D = MINUS_INF;
448 // for (i = 0; i <= len1; ++i) printf("(%d,%d,%d) ", curr[i].M, curr[i].I, curr[i].D); putchar('\n');
449 swap = curr; curr = last; last = swap;
452 int x, beg = 0, *scrow;
455 x = j - bw; beg = 0 > x? 0 : x; // band start
456 if (beg == 0) { // from zero-th column
458 p->M = p->D = MINUS_INF; p->I = -(ap->eio + ap->eie * j);
459 ++beg; // then beg = 1
461 scrow = scmat + seq2[j] * scmat_size;
462 for (i = beg, p = &curr[beg]; i < len1; ++i, ++p)
463 __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->iio, ap->iie, ap->edo, ap->ede);
464 __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->eio, ap->eie, ap->edo, ap->ede);
465 // for (i = 0; i <= len1; ++i) printf("(%d,%d,%d) ", curr[i].M, curr[i].I, curr[i].D); putchar('\n');
467 ret = curr[len1].M >= curr[len1].I? curr[len1].M : curr[len1].I;
468 ret = ret >= curr[len1].D? ret : curr[len1].D;
469 free(curr); free(last);
474 int main(int argc, char *argv[])
476 // int len1 = 35, len2 = 35;
477 // uint8_t *seq1 = (uint8_t*)"\0\0\3\3\2\0\0\0\1\0\2\1\2\1\3\2\3\3\3\0\2\3\2\1\1\3\3\3\2\3\3\1\0\0\1";
478 // uint8_t *seq2 = (uint8_t*)"\0\0\3\3\2\0\0\0\1\0\2\1\2\1\3\2\3\3\3\0\2\3\2\1\1\3\3\3\2\3\3\1\0\1\0";
479 int len1 = 4, len2 = 4;
480 uint8_t *seq1 = (uint8_t*)"\1\0\0\1";
481 uint8_t *seq2 = (uint8_t*)"\1\0\1\0";
483 // ka_global_core(seq1, 2, seq2, 1, &ka_param_qual, &sc, 0);
484 sc = ka_global_score(seq1, len1, seq2, len2, &ka_param2_qual);