X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=pysam.git;a=blobdiff_plain;f=samtools%2Fkaln.c.pysam.c;fp=samtools%2Fkaln.c.pysam.c;h=953a214ec9041545949dc33743b66eed59d84609;hp=0000000000000000000000000000000000000000;hb=d02fe5283ed7a93a2f76a5d6dc6e37b40c11b9b1;hpb=d828f9c9aa78e3d1687265b52de841f3f3852089 diff --git a/samtools/kaln.c.pysam.c b/samtools/kaln.c.pysam.c new file mode 100644 index 0000000..953a214 --- /dev/null +++ b/samtools/kaln.c.pysam.c @@ -0,0 +1,488 @@ +#include "pysam.h" + +/* The MIT License + + Copyright (c) 2003-2006, 2008, 2009, by Heng Li + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +#include +#include +#include +#include +#include +#include "kaln.h" + +#define FROM_M 0 +#define FROM_I 1 +#define FROM_D 2 + +typedef struct { + int i, j; + unsigned char ctype; +} path_t; + +int aln_sm_blosum62[] = { +/* A R N D C Q E G H I L K M F P S T W Y V * X */ + 4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0,-4, 0, + -1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3,-4,-1, + -2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3,-4,-1, + -2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3,-4,-1, + 0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-4,-2, + -1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2,-4,-1, + -1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 1,-2,-3,-1, 0,-1,-3,-2,-2,-4,-1, + 0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3,-4,-1, + -2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3,-4,-1, + -1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3,-4,-1, + -1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1,-4,-1, + -1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2,-4,-1, + -1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1,-4,-1, + -2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1,-4,-1, + -1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2,-4,-2, + 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2,-4, 0, + 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0,-4, 0, + -3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3,-4,-2, + -2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1,-4,-1, + 0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,-4,-1, + -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4, 1,-4, + 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-4,-1 +}; + +int aln_sm_blast[] = { + 1, -3, -3, -3, -2, + -3, 1, -3, -3, -2, + -3, -3, 1, -3, -2, + -3, -3, -3, 1, -2, + -2, -2, -2, -2, -2 +}; + +int aln_sm_qual[] = { + 0, -23, -23, -23, 0, + -23, 0, -23, -23, 0, + -23, -23, 0, -23, 0, + -23, -23, -23, 0, 0, + 0, 0, 0, 0, 0 +}; + +ka_param_t ka_param_blast = { 5, 2, 5, 2, aln_sm_blast, 5, 50 }; +ka_param_t ka_param_aa2aa = { 10, 2, 10, 2, aln_sm_blosum62, 22, 50 }; + +ka_param2_t ka_param2_qual = { 37, 11, 37, 11, 37, 11, 0, 0, aln_sm_qual, 5, 50 }; + +static uint32_t *ka_path2cigar32(const path_t *path, int path_len, int *n_cigar) +{ + int i, n; + uint32_t *cigar; + unsigned char last_type; + + if (path_len == 0 || path == 0) { + *n_cigar = 0; + return 0; + } + + last_type = path->ctype; + for (i = n = 1; i < path_len; ++i) { + if (last_type != path[i].ctype) ++n; + last_type = path[i].ctype; + } + *n_cigar = n; + cigar = (uint32_t*)calloc(*n_cigar, 4); + + cigar[0] = 1u << 4 | path[path_len-1].ctype; + last_type = path[path_len-1].ctype; + for (i = path_len - 2, n = 0; i >= 0; --i) { + if (path[i].ctype == last_type) cigar[n] += 1u << 4; + else { + cigar[++n] = 1u << 4 | path[i].ctype; + last_type = path[i].ctype; + } + } + + return cigar; +} + +/***************************/ +/* START OF common_align.c */ +/***************************/ + +#define SET_INF(s) (s).M = (s).I = (s).D = MINOR_INF; + +#define set_M(MM, cur, p, sc) \ +{ \ + if ((p)->M >= (p)->I) { \ + if ((p)->M >= (p)->D) { \ + (MM) = (p)->M + (sc); (cur)->Mt = FROM_M; \ + } else { \ + (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \ + } \ + } else { \ + if ((p)->I > (p)->D) { \ + (MM) = (p)->I + (sc); (cur)->Mt = FROM_I; \ + } else { \ + (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \ + } \ + } \ +} +#define set_I(II, cur, p) \ +{ \ + if ((p)->M - gap_open > (p)->I) { \ + (cur)->It = FROM_M; \ + (II) = (p)->M - gap_open - gap_ext; \ + } else { \ + (cur)->It = FROM_I; \ + (II) = (p)->I - gap_ext; \ + } \ +} +#define set_end_I(II, cur, p) \ +{ \ + if (gap_end_ext >= 0) { \ + if ((p)->M - gap_end_open > (p)->I) { \ + (cur)->It = FROM_M; \ + (II) = (p)->M - gap_end_open - gap_end_ext; \ + } else { \ + (cur)->It = FROM_I; \ + (II) = (p)->I - gap_end_ext; \ + } \ + } else set_I(II, cur, p); \ +} +#define set_D(DD, cur, p) \ +{ \ + if ((p)->M - gap_open > (p)->D) { \ + (cur)->Dt = FROM_M; \ + (DD) = (p)->M - gap_open - gap_ext; \ + } else { \ + (cur)->Dt = FROM_D; \ + (DD) = (p)->D - gap_ext; \ + } \ +} +#define set_end_D(DD, cur, p) \ +{ \ + if (gap_end_ext >= 0) { \ + if ((p)->M - gap_end_open > (p)->D) { \ + (cur)->Dt = FROM_M; \ + (DD) = (p)->M - gap_end_open - gap_end_ext; \ + } else { \ + (cur)->Dt = FROM_D; \ + (DD) = (p)->D - gap_end_ext; \ + } \ + } else set_D(DD, cur, p); \ +} + +typedef struct { + uint8_t Mt:3, It:2, Dt:3; +} dpcell_t; + +typedef struct { + int M, I, D; +} dpscore_t; + +/*************************** + * banded global alignment * + ***************************/ +uint32_t *ka_global_core(uint8_t *seq1, int len1, uint8_t *seq2, int len2, const ka_param_t *ap, int *_score, int *n_cigar) +{ + int i, j; + dpcell_t **dpcell, *q; + dpscore_t *curr, *last, *s; + int b1, b2, tmp_end; + int *mat, end, max = 0; + uint8_t type, ctype; + uint32_t *cigar = 0; + + int gap_open, gap_ext, gap_end_open, gap_end_ext, b; + int *score_matrix, N_MATRIX_ROW; + + /* initialize some align-related parameters. just for compatibility */ + gap_open = ap->gap_open; + gap_ext = ap->gap_ext; + gap_end_open = ap->gap_end_open; + gap_end_ext = ap->gap_end_ext; + b = ap->band_width; + score_matrix = ap->matrix; + N_MATRIX_ROW = ap->row; + + if (n_cigar) *n_cigar = 0; + if (len1 == 0 || len2 == 0) return 0; + + /* calculate b1 and b2 */ + if (len1 > len2) { + b1 = len1 - len2 + b; + b2 = b; + } else { + b1 = b; + b2 = len2 - len1 + b; + } + if (b1 > len1) b1 = len1; + if (b2 > len2) b2 = len2; + --seq1; --seq2; + + /* allocate memory */ + end = (b1 + b2 <= len1)? (b1 + b2 + 1) : (len1 + 1); + dpcell = (dpcell_t**)malloc(sizeof(dpcell_t*) * (len2 + 1)); + for (j = 0; j <= len2; ++j) + dpcell[j] = (dpcell_t*)malloc(sizeof(dpcell_t) * end); + for (j = b2 + 1; j <= len2; ++j) + dpcell[j] -= j - b2; + curr = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1)); + last = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1)); + + /* set first row */ + SET_INF(*curr); curr->M = 0; + for (i = 1, s = curr + 1; i < b1; ++i, ++s) { + SET_INF(*s); + set_end_D(s->D, dpcell[0] + i, s - 1); + } + s = curr; curr = last; last = s; + + /* core dynamic programming, part 1 */ + tmp_end = (b2 < len2)? b2 : len2 - 1; + for (j = 1; j <= tmp_end; ++j) { + q = dpcell[j]; s = curr; SET_INF(*s); + set_end_I(s->I, q, last); + end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1; + mat = score_matrix + seq2[j] * N_MATRIX_ROW; + ++s; ++q; + for (i = 1; i != end; ++i, ++s, ++q) { + set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */ + set_I(s->I, q, last + i); + set_D(s->D, q, s - 1); + } + set_M(s->M, q, last + i - 1, mat[seq1[i]]); + set_D(s->D, q, s - 1); + if (j + b1 - 1 > len1) { /* bug fixed, 040227 */ + set_end_I(s->I, q, last + i); + } else s->I = MINOR_INF; + s = curr; curr = last; last = s; + } + /* last row for part 1, use set_end_D() instead of set_D() */ + if (j == len2 && b2 != len2 - 1) { + q = dpcell[j]; s = curr; SET_INF(*s); + set_end_I(s->I, q, last); + end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1; + mat = score_matrix + seq2[j] * N_MATRIX_ROW; + ++s; ++q; + for (i = 1; i != end; ++i, ++s, ++q) { + set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */ + set_I(s->I, q, last + i); + set_end_D(s->D, q, s - 1); + } + set_M(s->M, q, last + i - 1, mat[seq1[i]]); + set_end_D(s->D, q, s - 1); + if (j + b1 - 1 > len1) { /* bug fixed, 040227 */ + set_end_I(s->I, q, last + i); + } else s->I = MINOR_INF; + s = curr; curr = last; last = s; + ++j; + } + + /* core dynamic programming, part 2 */ + for (; j <= len2 - b2 + 1; ++j) { + SET_INF(curr[j - b2]); + mat = score_matrix + seq2[j] * N_MATRIX_ROW; + end = j + b1 - 1; + for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i != end; ++i, ++s, ++q) { + set_M(s->M, q, last + i - 1, mat[seq1[i]]); + set_I(s->I, q, last + i); + set_D(s->D, q, s - 1); + } + set_M(s->M, q, last + i - 1, mat[seq1[i]]); + set_D(s->D, q, s - 1); + s->I = MINOR_INF; + s = curr; curr = last; last = s; + } + + /* core dynamic programming, part 3 */ + for (; j < len2; ++j) { + SET_INF(curr[j - b2]); + mat = score_matrix + seq2[j] * N_MATRIX_ROW; + for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) { + set_M(s->M, q, last + i - 1, mat[seq1[i]]); + set_I(s->I, q, last + i); + set_D(s->D, q, s - 1); + } + set_M(s->M, q, last + len1 - 1, mat[seq1[i]]); + set_end_I(s->I, q, last + i); + set_D(s->D, q, s - 1); + s = curr; curr = last; last = s; + } + /* last row */ + if (j == len2) { + SET_INF(curr[j - b2]); + mat = score_matrix + seq2[j] * N_MATRIX_ROW; + for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) { + set_M(s->M, q, last + i - 1, mat[seq1[i]]); + set_I(s->I, q, last + i); + set_end_D(s->D, q, s - 1); + } + set_M(s->M, q, last + len1 - 1, mat[seq1[i]]); + set_end_I(s->I, q, last + i); + set_end_D(s->D, q, s - 1); + s = curr; curr = last; last = s; + } + + *_score = last[len1].M; + if (n_cigar) { /* backtrace */ + path_t *p, *path = (path_t*)malloc(sizeof(path_t) * (len1 + len2 + 2)); + i = len1; j = len2; + q = dpcell[j] + i; + s = last + len1; + max = s->M; type = q->Mt; ctype = FROM_M; + if (s->I > max) { max = s->I; type = q->It; ctype = FROM_I; } + if (s->D > max) { max = s->D; type = q->Dt; ctype = FROM_D; } + + p = path; + p->ctype = ctype; p->i = i; p->j = j; /* bug fixed 040408 */ + ++p; + do { + switch (ctype) { + case FROM_M: --i; --j; break; + case FROM_I: --j; break; + case FROM_D: --i; break; + } + q = dpcell[j] + i; + ctype = type; + switch (type) { + case FROM_M: type = q->Mt; break; + case FROM_I: type = q->It; break; + case FROM_D: type = q->Dt; break; + } + p->ctype = ctype; p->i = i; p->j = j; + ++p; + } while (i || j); + cigar = ka_path2cigar32(path, p - path - 1, n_cigar); + free(path); + } + + /* free memory */ + for (j = b2 + 1; j <= len2; ++j) + dpcell[j] += j - b2; + for (j = 0; j <= len2; ++j) + free(dpcell[j]); + free(dpcell); + free(curr); free(last); + + return cigar; +} + +typedef struct { + int M, I, D; +} score_aux_t; + +#define MINUS_INF -0x40000000 + +// matrix: len2 rows and len1 columns +int ka_global_score(const uint8_t *_seq1, int len1, const uint8_t *_seq2, int len2, const ka_param2_t *ap) +{ + +#define __score_aux(_p, _q0, _sc, _io, _ie, _do, _de) { \ + int t1, t2; \ + score_aux_t *_q; \ + _q = _q0; \ + _p->M = _q->M >= _q->I? _q->M : _q->I; \ + _p->M = _p->M >= _q->D? _p->M : _q->D; \ + _p->M += (_sc); \ + ++_q; t1 = _q->M - _io - _ie; t2 = _q->I - _ie; _p->I = t1 >= t2? t1 : t2; \ + _q = _p-1; t1 = _q->M - _do - _de; t2 = _q->D - _de; _p->D = t1 >= t2? t1 : t2; \ + } + + int i, j, bw, scmat_size = ap->row, *scmat = ap->matrix, ret; + const uint8_t *seq1, *seq2; + score_aux_t *curr, *last, *swap; + bw = abs(len1 - len2) + ap->band_width; + i = len1 > len2? len1 : len2; + if (bw > i + 1) bw = i + 1; + seq1 = _seq1 - 1; seq2 = _seq2 - 1; + curr = calloc(len1 + 2, sizeof(score_aux_t)); + last = calloc(len1 + 2, sizeof(score_aux_t)); + { // the zero-th row + int x, end = len1; + score_aux_t *p; + j = 0; + x = j + bw; end = len1 < x? len1 : x; // band end + p = curr; + p->M = 0; p->I = p->D = MINUS_INF; + for (i = 1, p = &curr[1]; i <= end; ++i, ++p) + p->M = p->I = MINUS_INF, p->D = -(ap->edo + ap->ede * i); + p->M = p->I = p->D = MINUS_INF; + swap = curr; curr = last; last = swap; + } + for (j = 1; j < len2; ++j) { + int x, beg = 0, end = len1, *scrow, col_end; + score_aux_t *p; + x = j - bw; beg = 0 > x? 0 : x; // band start + x = j + bw; end = len1 < x? len1 : x; // band end + if (beg == 0) { // from zero-th column + p = curr; + p->M = p->D = MINUS_INF; p->I = -(ap->eio + ap->eie * j); + ++beg; // then beg = 1 + } + scrow = scmat + seq2[j] * scmat_size; + if (end == len1) col_end = 1, --end; + else col_end = 0; + for (i = beg, p = &curr[beg]; i <= end; ++i, ++p) + __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->iio, ap->iie, ap->ido, ap->ide); + if (col_end) { + __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->eio, ap->eie, ap->ido, ap->ide); + ++p; + } + p->M = p->I = p->D = MINUS_INF; +// for (i = 0; i <= len1; ++i) printf("(%d,%d,%d) ", curr[i].M, curr[i].I, curr[i].D); putchar('\n'); + swap = curr; curr = last; last = swap; + } + { // the last row + int x, beg = 0, *scrow; + score_aux_t *p; + j = len2; + x = j - bw; beg = 0 > x? 0 : x; // band start + if (beg == 0) { // from zero-th column + p = curr; + p->M = p->D = MINUS_INF; p->I = -(ap->eio + ap->eie * j); + ++beg; // then beg = 1 + } + scrow = scmat + seq2[j] * scmat_size; + for (i = beg, p = &curr[beg]; i < len1; ++i, ++p) + __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->iio, ap->iie, ap->edo, ap->ede); + __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->eio, ap->eie, ap->edo, ap->ede); +// for (i = 0; i <= len1; ++i) printf("(%d,%d,%d) ", curr[i].M, curr[i].I, curr[i].D); putchar('\n'); + } + ret = curr[len1].M >= curr[len1].I? curr[len1].M : curr[len1].I; + ret = ret >= curr[len1].D? ret : curr[len1].D; + free(curr); free(last); + return ret; +} + +#ifdef _MAIN +int main(int argc, char *argv[]) +{ +// int len1 = 35, len2 = 35; +// uint8_t *seq1 = (uint8_t*)"\0\0\3\3\2\0\0\0\1\0\2\1\2\1\3\2\3\3\3\0\2\3\2\1\1\3\3\3\2\3\3\1\0\0\1"; +// uint8_t *seq2 = (uint8_t*)"\0\0\3\3\2\0\0\0\1\0\2\1\2\1\3\2\3\3\3\0\2\3\2\1\1\3\3\3\2\3\3\1\0\1\0"; + int len1 = 4, len2 = 4; + uint8_t *seq1 = (uint8_t*)"\1\0\0\1"; + uint8_t *seq2 = (uint8_t*)"\1\0\1\0"; + int sc; +// ka_global_core(seq1, 2, seq2, 1, &ka_param_qual, &sc, 0); + sc = ka_global_score(seq1, len1, seq2, len2, &ka_param2_qual); + printf("%d\n", sc); + return 0; +} +#endif