X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=samtools.git;a=blobdiff_plain;f=bcftools%2Fbcfutils.c;h=0eab4c1f322b398750fdda9da4caec6eea8e7579;hp=ae7ec0f0256f2bff77ad12d31f8c05ece2f518a0;hb=0f906dafb2ad22371a753557562ef95c3034480d;hpb=c34624801b980425af68c3c431423c72b18c14fe diff --git a/bcftools/bcfutils.c b/bcftools/bcfutils.c index ae7ec0f..0eab4c1 100644 --- a/bcftools/bcfutils.c +++ b/bcftools/bcfutils.c @@ -5,6 +5,12 @@ #include "khash.h" KHASH_MAP_INIT_STR(str2id, int) +#ifdef _WIN32 +#define srand48(x) srand(x) +#define drand48() ((double)rand() / RAND_MAX) +#endif + +// FIXME: valgrind report a memory leak in this function. Probably it does not get deallocated... void *bcf_build_refhash(bcf_hdr_t *h) { khash_t(str2id) *hash; @@ -141,6 +147,54 @@ int bcf_fix_gt(bcf1_t *b) return 0; } +int bcf_fix_pl(bcf1_t *b) +{ + int i; + uint32_t tmp; + uint8_t *PL, *swap; + bcf_ginfo_t *gi; + // pinpoint PL + tmp = bcf_str2int("PL", 2); + for (i = 0; i < b->n_gi; ++i) + if (b->gi[i].fmt == tmp) break; + if (i == b->n_gi) return 0; + // prepare + gi = b->gi + i; + PL = (uint8_t*)gi->data; + swap = alloca(gi->len); + // loop through individuals + for (i = 0; i < b->n_smpl; ++i) { + int k, l, x; + uint8_t *PLi = PL + i * gi->len; + memcpy(swap, PLi, gi->len); + for (k = x = 0; k < b->n_alleles; ++k) + for (l = k; l < b->n_alleles; ++l) + PLi[l*(l+1)/2 + k] = swap[x++]; + } + return 0; +} + +int bcf_smpl_covered(const bcf1_t *b) +{ + int i, j, n = 0; + uint32_t tmp; + bcf_ginfo_t *gi; + // pinpoint PL + tmp = bcf_str2int("PL", 2); + for (i = 0; i < b->n_gi; ++i) + if (b->gi[i].fmt == tmp) break; + if (i == b->n_gi) return 0; + // count how many samples having PL!=[0..0] + gi = b->gi + i; + for (i = 0; i < b->n_smpl; ++i) { + uint8_t *PLi = ((uint8_t*)gi->data) + i * gi->len; + for (j = 0; j < gi->len; ++j) + if (PLi[j]) break; + if (j < gi->len) ++n; + } + return n; +} + static void *locate_field(const bcf1_t *b, const char *fmt, int l) { int i; @@ -188,6 +242,31 @@ int bcf_anno_max(bcf1_t *b) return 0; } +// FIXME: only data are shuffled; the header is NOT +int bcf_shuffle(bcf1_t *b, int seed) +{ + int i, j, *a; + if (seed > 0) srand48(seed); + a = malloc(b->n_smpl * sizeof(int)); + for (i = 0; i < b->n_smpl; ++i) a[i] = i; + for (i = b->n_smpl; i > 1; --i) { + int tmp; + j = (int)(drand48() * i); + tmp = a[j]; a[j] = a[i-1]; a[i-1] = tmp; + } + for (j = 0; j < b->n_gi; ++j) { + bcf_ginfo_t *gi = b->gi + j; + uint8_t *swap, *data = (uint8_t*)gi->data; + swap = malloc(gi->len * b->n_smpl); + for (i = 0; i < b->n_smpl; ++i) + memcpy(swap + gi->len * a[i], data + gi->len * i, gi->len); + free(gi->data); + gi->data = swap; + } + free(a); + return 0; +} + bcf_hdr_t *bcf_hdr_subsam(const bcf_hdr_t *h0, int n, char *const* samples, int *list) { int i, ret, j; @@ -197,12 +276,15 @@ bcf_hdr_t *bcf_hdr_subsam(const bcf_hdr_t *h0, int n, char *const* samples, int kstring_t s; s.l = s.m = 0; s.s = 0; hash = kh_init(str2id); - for (i = 0; i < n; ++i) - k = kh_put(str2id, hash, samples[i], &ret); - for (i = j = 0; i < h0->n_smpl; ++i) { - if (kh_get(str2id, hash, h0->sns[i]) != kh_end(hash)) { - list[j++] = i; - kputs(h0->sns[i], &s); kputc('\0', &s); + for (i = 0; i < h0->n_smpl; ++i) { + k = kh_put(str2id, hash, h0->sns[i], &ret); + kh_val(hash, k) = i; + } + for (i = j = 0; i < n; ++i) { + k = kh_get(str2id, hash, samples[i]); + if (k != kh_end(hash)) { + list[j++] = kh_val(hash, k); + kputs(samples[i], &s); kputc('\0', &s); } } if (j < n) fprintf(stderr, "<%s> %d samples in the list but not in BCF.", __func__, n - j); @@ -217,14 +299,92 @@ bcf_hdr_t *bcf_hdr_subsam(const bcf_hdr_t *h0, int n, char *const* samples, int return h; } -int bcf_subsam(int n_smpl, int *list, bcf1_t *b) // list MUST BE sorted +int bcf_subsam(int n_smpl, int *list, bcf1_t *b) { int i, j; for (j = 0; j < b->n_gi; ++j) { bcf_ginfo_t *gi = b->gi + j; + uint8_t *swap; + swap = malloc(gi->len * b->n_smpl); for (i = 0; i < n_smpl; ++i) - memcpy((uint8_t*)gi->data + i * gi->len, (uint8_t*)gi->data + list[i] * gi->len, gi->len); + memcpy(swap + i * gi->len, (uint8_t*)gi->data + list[i] * gi->len, gi->len); + free(gi->data); + gi->data = swap; } b->n_smpl = n_smpl; return 0; } + +static int8_t nt4_table[128] = { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 /*'-'*/, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, -1, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, -1, 4, 4, 4, 4, 4, 4, 4 +}; + +int bcf_gl10(const bcf1_t *b, uint8_t *gl) +{ + int a[4], k, l, map[4], k1, j, i; + const bcf_ginfo_t *PL; + char *s; + if (b->ref[1] != 0 || b->n_alleles > 4) return -1; // ref is not a single base or >4 alleles + for (i = 0; i < b->n_gi; ++i) + if (b->gi[i].fmt == bcf_str2int("PL", 2)) break; + if (i == b->n_gi) return -1; // no PL + PL = b->gi + i; + a[0] = nt4_table[(int)b->ref[0]]; + if (a[0] > 3 || a[0] < 0) return -1; // ref is not A/C/G/T + a[1] = a[2] = a[3] = -2; // -1 has a special meaning + if (b->alt[0] == 0) return -1; // no alternate allele + map[0] = map[1] = map[2] = map[3] = -2; + map[a[0]] = 0; + for (k = 0, s = b->alt, k1 = -1; k < 3 && *s; ++k, s += 2) { + if (s[1] != ',' && s[1] != 0) return -1; // ALT is not single base + a[k+1] = nt4_table[(int)*s]; + if (a[k+1] >= 0) map[a[k+1]] = k+1; + else k1 = k + 1; + if (s[1] == 0) break; // the end of the ALT string + } + for (k = 0; k < 4; ++k) + if (map[k] < 0) map[k] = k1; + for (i = 0; i < b->n_smpl; ++i) { + const uint8_t *p = PL->data + i * PL->len; // the PL for the i-th individual + uint8_t *g = gl + 10 * i; + for (k = j = 0; k < 4; ++k) { + for (l = k; l < 4; ++l) { + int t, x = map[k], y = map[l]; + if (x > y) t = x, x = y, y = t; // make sure x is the smaller + g[j++] = p[y * (y+1) / 2 + x]; + } + } + } + return 0; +} + +int bcf_gl10_indel(const bcf1_t *b, uint8_t *gl) +{ + int k, l, j, i; + const bcf_ginfo_t *PL; + if (b->alt[0] == 0) return -1; // no alternate allele + for (i = 0; i < b->n_gi; ++i) + if (b->gi[i].fmt == bcf_str2int("PL", 2)) break; + if (i == b->n_gi) return -1; // no PL + PL = b->gi + i; + for (i = 0; i < b->n_smpl; ++i) { + const uint8_t *p = PL->data + i * PL->len; // the PL for the i-th individual + uint8_t *g = gl + 10 * i; + for (k = j = 0; k < 4; ++k) { + for (l = k; l < 4; ++l) { + int t, x = k, y = l; + if (x > y) t = x, x = y, y = t; // make sure x is the smaller + x = y * (y+1) / 2 + x; + g[j++] = x < PL->len? p[x] : 255; + } + } + } + return 0; +}