Imported Upstream version 0.1.7~dfsg
[samtools.git] / kaln.c
diff --git a/kaln.c b/kaln.c
new file mode 100644 (file)
index 0000000..9fa40d0
--- /dev/null
+++ b/kaln.c
@@ -0,0 +1,370 @@
+/* The MIT License
+
+   Copyright (c) 2003-2006, 2008, 2009, by Heng Li <lh3lh3@gmail.com>
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+#include "kaln.h"
+
+#define FROM_M 0
+#define FROM_I 1
+#define FROM_D 2
+
+typedef struct {
+       int i, j;
+       unsigned char ctype;
+} path_t;
+
+int aln_sm_blosum62[] = {
+/*      A  R  N  D  C  Q  E  G  H  I  L  K  M  F  P  S  T  W  Y  V  *  X */
+        4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0,-4, 0,
+       -1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3,-4,-1,
+       -2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3,-4,-1,
+       -2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3,-4,-1,
+        0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-4,-2,
+       -1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2,-4,-1,
+       -1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 1,-2,-3,-1, 0,-1,-3,-2,-2,-4,-1,
+        0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3,-4,-1,
+       -2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3,-4,-1,
+       -1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3,-4,-1,
+       -1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1,-4,-1,
+       -1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2,-4,-1,
+       -1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1,-4,-1,
+       -2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1,-4,-1,
+       -1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2,-4,-2,
+        1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2,-4, 0,
+        0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0,-4, 0,
+       -3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3,-4,-2,
+       -2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1,-4,-1,
+        0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,-4,-1,
+       -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4, 1,-4,
+        0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-4,-1
+};
+
+int aln_sm_blast[] = {
+       1, -3, -3, -3, -2,
+       -3, 1, -3, -3, -2,
+       -3, -3, 1, -3, -2,
+       -3, -3, -3, 1, -2,
+       -2, -2, -2, -2, -2
+};
+
+ka_param_t ka_param_blast = {  5,  2,  2, aln_sm_blast, 5, 50 };
+ka_param_t ka_param_aa2aa = { 10,  2,  2, aln_sm_blosum62, 22, 50 };
+
+static uint32_t *ka_path2cigar32(const path_t *path, int path_len, int *n_cigar)
+{
+       int i, n;
+       uint32_t *cigar;
+       unsigned char last_type;
+
+       if (path_len == 0 || path == 0) {
+               *n_cigar = 0;
+               return 0;
+       }
+
+       last_type = path->ctype;
+       for (i = n = 1; i < path_len; ++i) {
+               if (last_type != path[i].ctype) ++n;
+               last_type = path[i].ctype;
+       }
+       *n_cigar = n;
+       cigar = (uint32_t*)calloc(*n_cigar, 4);
+
+       cigar[0] = 1u << 4 | path[path_len-1].ctype;
+       last_type = path[path_len-1].ctype;
+       for (i = path_len - 2, n = 0; i >= 0; --i) {
+               if (path[i].ctype == last_type) cigar[n] += 1u << 4;
+               else {
+                       cigar[++n] = 1u << 4 | path[i].ctype;
+                       last_type = path[i].ctype;
+               }
+       }
+
+       return cigar;
+}
+
+/***************************/
+/* START OF common_align.c */
+/***************************/
+
+#define SET_INF(s) (s).M = (s).I = (s).D = MINOR_INF;
+
+#define set_M(MM, cur, p, sc)                                                  \
+{                                                                                                              \
+       if ((p)->M >= (p)->I) {                                                         \
+               if ((p)->M >= (p)->D) {                                                 \
+                       (MM) = (p)->M + (sc); (cur)->Mt = FROM_M;       \
+               } else {                                                                                \
+                       (MM) = (p)->D + (sc); (cur)->Mt = FROM_D;       \
+               }                                                                                               \
+       } else {                                                                                        \
+               if ((p)->I > (p)->D) {                                                  \
+                       (MM) = (p)->I + (sc); (cur)->Mt = FROM_I;       \
+               } else {                                                                                \
+                       (MM) = (p)->D + (sc); (cur)->Mt = FROM_D;       \
+               }                                                                                               \
+       }                                                                                                       \
+}
+#define set_I(II, cur, p)                                                              \
+{                                                                                                              \
+       if ((p)->M - gap_open > (p)->I) {                                       \
+               (cur)->It = FROM_M;                                                             \
+               (II) = (p)->M - gap_open - gap_ext;                             \
+       } else {                                                                                        \
+               (cur)->It = FROM_I;                                                             \
+               (II) = (p)->I - gap_ext;                                                \
+       }                                                                                                       \
+}
+#define set_end_I(II, cur, p)                                                  \
+{                                                                                                              \
+       if (gap_end >= 0) {                                                                     \
+               if ((p)->M - gap_open > (p)->I) {                               \
+                       (cur)->It = FROM_M;                                                     \
+                       (II) = (p)->M - gap_open - gap_end;                     \
+               } else {                                                                                \
+                       (cur)->It = FROM_I;                                                     \
+                       (II) = (p)->I - gap_end;                                        \
+               }                                                                                               \
+       } else set_I(II, cur, p);                                                       \
+}
+#define set_D(DD, cur, p)                                                              \
+{                                                                                                              \
+       if ((p)->M - gap_open > (p)->D) {                                       \
+               (cur)->Dt = FROM_M;                                                             \
+               (DD) = (p)->M - gap_open - gap_ext;                             \
+       } else {                                                                                        \
+               (cur)->Dt = FROM_D;                                                             \
+               (DD) = (p)->D - gap_ext;                                                \
+       }                                                                                                       \
+}
+#define set_end_D(DD, cur, p)                                                  \
+{                                                                                                              \
+       if (gap_end >= 0) {                                                                     \
+               if ((p)->M - gap_open > (p)->D) {                               \
+                       (cur)->Dt = FROM_M;                                                     \
+                       (DD) = (p)->M - gap_open - gap_end;                     \
+               } else {                                                                                \
+                       (cur)->Dt = FROM_D;                                                     \
+                       (DD) = (p)->D - gap_end;                                        \
+               }                                                                                               \
+       } else set_D(DD, cur, p);                                                       \
+}
+
+typedef struct {
+       uint8_t Mt:3, It:2, Dt:2;
+} dpcell_t;
+
+typedef struct {
+       int M, I, D;
+} dpscore_t;
+
+/***************************
+ * banded global alignment *
+ ***************************/
+uint32_t *ka_global_core(uint8_t *seq1, int len1, uint8_t *seq2, int len2, const ka_param_t *ap, int *_score, int *n_cigar)
+{
+       int i, j;
+       dpcell_t **dpcell, *q;
+       dpscore_t *curr, *last, *s;
+       int b1, b2, tmp_end;
+       int *mat, end, max = 0;
+       uint8_t type, ctype;
+       uint32_t *cigar = 0;
+
+       int gap_open, gap_ext, gap_end, b;
+       int *score_matrix, N_MATRIX_ROW;
+
+       /* initialize some align-related parameters. just for compatibility */
+       gap_open = ap->gap_open;
+       gap_ext = ap->gap_ext;
+       gap_end = ap->gap_end;
+       b = ap->band_width;
+       score_matrix = ap->matrix;
+       N_MATRIX_ROW = ap->row;
+
+       *n_cigar = 0;
+       if (len1 == 0 || len2 == 0) return 0;
+
+       /* calculate b1 and b2 */
+       if (len1 > len2) {
+               b1 = len1 - len2 + b;
+               b2 = b;
+       } else {
+               b1 = b;
+               b2 = len2 - len1 + b;
+       }
+       if (b1 > len1) b1 = len1;
+       if (b2 > len2) b2 = len2;
+       --seq1; --seq2;
+
+       /* allocate memory */
+       end = (b1 + b2 <= len1)? (b1 + b2 + 1) : (len1 + 1);
+       dpcell = (dpcell_t**)malloc(sizeof(dpcell_t*) * (len2 + 1));
+       for (j = 0; j <= len2; ++j)
+               dpcell[j] = (dpcell_t*)malloc(sizeof(dpcell_t) * end);
+       for (j = b2 + 1; j <= len2; ++j)
+               dpcell[j] -= j - b2;
+       curr = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1));
+       last = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1));
+       
+       /* set first row */
+       SET_INF(*curr); curr->M = 0;
+       for (i = 1, s = curr + 1; i < b1; ++i, ++s) {
+               SET_INF(*s);
+               set_end_D(s->D, dpcell[0] + i, s - 1);
+       }
+       s = curr; curr = last; last = s;
+
+       /* core dynamic programming, part 1 */
+       tmp_end = (b2 < len2)? b2 : len2 - 1;
+       for (j = 1; j <= tmp_end; ++j) {
+               q = dpcell[j]; s = curr; SET_INF(*s);
+               set_end_I(s->I, q, last);
+               end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1;
+               mat = score_matrix + seq2[j] * N_MATRIX_ROW;
+               ++s; ++q;
+               for (i = 1; i != end; ++i, ++s, ++q) {
+                       set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */
+                       set_I(s->I, q, last + i);
+                       set_D(s->D, q, s - 1);
+               }
+               set_M(s->M, q, last + i - 1, mat[seq1[i]]);
+               set_D(s->D, q, s - 1);
+               if (j + b1 - 1 > len1) { /* bug fixed, 040227 */
+                       set_end_I(s->I, q, last + i);
+               } else s->I = MINOR_INF;
+               s = curr; curr = last; last = s;
+       }
+       /* last row for part 1, use set_end_D() instead of set_D() */
+       if (j == len2 && b2 != len2 - 1) {
+               q = dpcell[j]; s = curr; SET_INF(*s);
+               set_end_I(s->I, q, last);
+               end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1;
+               mat = score_matrix + seq2[j] * N_MATRIX_ROW;
+               ++s; ++q;
+               for (i = 1; i != end; ++i, ++s, ++q) {
+                       set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */
+                       set_I(s->I, q, last + i);
+                       set_end_D(s->D, q, s - 1);
+               }
+               set_M(s->M, q, last + i - 1, mat[seq1[i]]);
+               set_end_D(s->D, q, s - 1);
+               if (j + b1 - 1 > len1) { /* bug fixed, 040227 */
+                       set_end_I(s->I, q, last + i);
+               } else s->I = MINOR_INF;
+               s = curr; curr = last; last = s;
+               ++j;
+       }
+
+       /* core dynamic programming, part 2 */
+       for (; j <= len2 - b2 + 1; ++j) {
+               SET_INF(curr[j - b2]);
+               mat = score_matrix + seq2[j] * N_MATRIX_ROW;
+               end = j + b1 - 1;
+               for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i != end; ++i, ++s, ++q) {
+                       set_M(s->M, q, last + i - 1, mat[seq1[i]]);
+                       set_I(s->I, q, last + i);
+                       set_D(s->D, q, s - 1);
+               }
+               set_M(s->M, q, last + i - 1, mat[seq1[i]]);
+               set_D(s->D, q, s - 1);
+               s->I = MINOR_INF;
+               s = curr; curr = last; last = s;
+       }
+
+       /* core dynamic programming, part 3 */
+       for (; j < len2; ++j) {
+               SET_INF(curr[j - b2]);
+               mat = score_matrix + seq2[j] * N_MATRIX_ROW;
+               for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) {
+                       set_M(s->M, q, last + i - 1, mat[seq1[i]]);
+                       set_I(s->I, q, last + i);
+                       set_D(s->D, q, s - 1);
+               }
+               set_M(s->M, q, last + len1 - 1, mat[seq1[i]]);
+               set_end_I(s->I, q, last + i);
+               set_D(s->D, q, s - 1);
+               s = curr; curr = last; last = s;
+       }
+       /* last row */
+       if (j == len2) {
+               SET_INF(curr[j - b2]);
+               mat = score_matrix + seq2[j] * N_MATRIX_ROW;
+               for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) {
+                       set_M(s->M, q, last + i - 1, mat[seq1[i]]);
+                       set_I(s->I, q, last + i);
+                       set_end_D(s->D, q, s - 1);
+               }
+               set_M(s->M, q, last + len1 - 1, mat[seq1[i]]);
+               set_end_I(s->I, q, last + i);
+               set_end_D(s->D, q, s - 1);
+               s = curr; curr = last; last = s;
+       }
+
+       *_score = last[len1].M;
+       if (n_cigar) { /* backtrace */
+               path_t *p, *path = (path_t*)malloc(sizeof(path_t) * (len1 + len2 + 2));
+               i = len1; j = len2;
+               q = dpcell[j] + i;
+               s = last + len1;
+               max = s->M; type = q->Mt; ctype = FROM_M;
+               if (s->I > max) { max = s->I; type = q->It; ctype = FROM_I; }
+               if (s->D > max) { max = s->D; type = q->Dt; ctype = FROM_D; }
+
+               p = path;
+               p->ctype = ctype; p->i = i; p->j = j; /* bug fixed 040408 */
+               ++p;
+               do {
+                       switch (ctype) {
+                       case FROM_M: --i; --j; break;
+                       case FROM_I: --j; break;
+                       case FROM_D: --i; break;
+                       }
+                       q = dpcell[j] + i;
+                       ctype = type;
+                       switch (type) {
+                       case FROM_M: type = q->Mt; break;
+                       case FROM_I: type = q->It; break;
+                       case FROM_D: type = q->Dt; break;
+                       }
+                       p->ctype = ctype; p->i = i; p->j = j;
+                       ++p;
+               } while (i || j);
+               cigar = ka_path2cigar32(path, p - path - 1, n_cigar);
+               free(path);
+       }
+
+       /* free memory */
+       for (j = b2 + 1; j <= len2; ++j)
+               dpcell[j] += j - b2;
+       for (j = 0; j <= len2; ++j)
+               free(dpcell[j]);
+       free(dpcell);
+       free(curr); free(last);
+
+       return cigar;
+}