8 KHASH_MAP_INIT_STR(str2id, int)
11 #define srand48(x) srand(x)
12 #define drand48() ((double)rand() / RAND_MAX)
15 // FIXME: valgrind report a memory leak in this function. Probably it does not get deallocated...
16 void *bcf_build_refhash(bcf_hdr_t *h)
18 khash_t(str2id) *hash;
20 hash = kh_init(str2id);
21 for (i = 0; i < h->n_ref; ++i) {
23 k = kh_put(str2id, hash, h->ns[i], &ret); // FIXME: check ret
29 void *bcf_str2id_init()
31 return kh_init(str2id);
34 void bcf_str2id_destroy(void *_hash)
36 khash_t(str2id) *hash = (khash_t(str2id)*)_hash;
37 if (hash) kh_destroy(str2id, hash); // Note that strings are not freed.
40 void bcf_str2id_thorough_destroy(void *_hash)
42 khash_t(str2id) *hash = (khash_t(str2id)*)_hash;
44 if (hash == 0) return;
45 for (k = 0; k < kh_end(hash); ++k)
46 if (kh_exist(hash, k)) free((char*)kh_key(hash, k));
47 kh_destroy(str2id, hash);
50 int bcf_str2id(void *_hash, const char *str)
52 khash_t(str2id) *hash = (khash_t(str2id)*)_hash;
55 k = kh_get(str2id, hash, str);
56 return k == kh_end(hash)? -1 : kh_val(hash, k);
59 int bcf_str2id_add(void *_hash, const char *str)
63 khash_t(str2id) *hash = (khash_t(str2id)*)_hash;
65 k = kh_put(str2id, hash, str, &ret);
66 if (ret == 0) return kh_val(hash, k);
67 kh_val(hash, k) = kh_size(hash) - 1;
68 return kh_val(hash, k);
71 int bcf_shrink_alt(bcf1_t *b, int n)
74 int i, j, k, n_smpl = b->n_smpl;
75 if (b->n_alleles <= n) return -1;
78 for (p = b->alt, k = 1; *p; ++p)
79 if (*p == ',' && ++k == n) break;
81 } else p = b->alt, *p = '\0';
83 memmove(p, b->flt, b->str + b->l_str - b->flt);
84 b->l_str -= b->flt - p;
86 for (i = 0; i < b->n_gi; ++i) {
87 bcf_ginfo_t *g = b->gi + i;
88 if (g->fmt == bcf_str2int("PL", 2)) {
89 int l, x = b->n_alleles * (b->n_alleles + 1) / 2;
90 uint8_t *d = (uint8_t*)g->data;
91 g->len = n * (n + 1) / 2;
92 for (l = k = 0; l < n_smpl; ++l) {
93 uint8_t *dl = d + l * x;
94 for (j = 0; j < g->len; ++j) d[k++] = dl[j];
103 int bcf_gl2pl(bcf1_t *b)
106 int i, n_smpl = b->n_smpl;
110 if (strstr(b->fmt, "PL")) return -1;
111 if ((p = strstr(b->fmt, "GL")) == 0) return -1;
113 for (i = 0; i < b->n_gi; ++i)
114 if (b->gi[i].fmt == bcf_str2int("GL", 2))
117 g->fmt = bcf_str2int("PL", 2);
118 g->len /= 4; // 4 == sizeof(float)
119 d0 = (float*)g->data; d1 = (uint8_t*)g->data;
120 for (i = 0; i < n_smpl * g->len; ++i) {
121 int x = (int)(-10. * d0[i] + .499);
122 if (x > 255) x = 255;
128 /* FIXME: this function will fail given AB:GTX:GT. BCFtools never
129 * produces such FMT, but others may do. */
130 int bcf_fix_gt(bcf1_t *b)
136 // check the presence of the GT FMT
137 if ((s = strstr(b->fmt, ":GT")) == 0) return 0; // no GT or GT is already the first
138 if (s[3] != '\0' && s[3] != ':') return 0; // :GTX in fact
139 tmp = bcf_str2int("GT", 2);
140 for (i = 0; i < b->n_gi; ++i)
141 if (b->gi[i].fmt == tmp) break;
142 if (i == b->n_gi) return 0; // no GT in b->gi; probably a bug...
144 // move GT to the first
145 for (; i > 0; --i) b->gi[i] = b->gi[i-1];
147 memmove(b->fmt + 3, b->fmt, s + 1 - b->fmt);
148 b->fmt[0] = 'G'; b->fmt[1] = 'T'; b->fmt[2] = ':';
152 int bcf_fix_pl(bcf1_t *b)
159 tmp = bcf_str2int("PL", 2);
160 for (i = 0; i < b->n_gi; ++i)
161 if (b->gi[i].fmt == tmp) break;
162 if (i == b->n_gi) return 0;
165 PL = (uint8_t*)gi->data;
166 swap = alloca(gi->len);
167 // loop through individuals
168 for (i = 0; i < b->n_smpl; ++i) {
170 uint8_t *PLi = PL + i * gi->len;
171 memcpy(swap, PLi, gi->len);
172 for (k = x = 0; k < b->n_alleles; ++k)
173 for (l = k; l < b->n_alleles; ++l)
174 PLi[l*(l+1)/2 + k] = swap[x++];
179 int bcf_smpl_covered(const bcf1_t *b)
185 tmp = bcf_str2int("PL", 2);
186 for (i = 0; i < b->n_gi; ++i)
187 if (b->gi[i].fmt == tmp) break;
188 if (i == b->n_gi) return 0;
189 // count how many samples having PL!=[0..0]
191 for (i = 0; i < b->n_smpl; ++i) {
192 uint8_t *PLi = ((uint8_t*)gi->data) + i * gi->len;
193 for (j = 0; j < gi->len; ++j)
195 if (j < gi->len) ++n;
200 static void *locate_field(const bcf1_t *b, const char *fmt, int l)
204 tmp = bcf_str2int(fmt, l);
205 for (i = 0; i < b->n_gi; ++i)
206 if (b->gi[i].fmt == tmp) break;
207 return i == b->n_gi? 0 : b->gi[i].data;
210 int bcf_anno_max(bcf1_t *b)
212 int k, max_gq, max_sp, n_het;
216 max_gq = max_sp = n_het = 0;
217 gt = locate_field(b, "GT", 2);
218 if (gt == 0) return -1;
219 gq = locate_field(b, "GQ", 2);
220 sp = locate_field(b, "SP", 2);
222 for (k = 0; k < b->n_smpl; ++k)
224 max_sp = max_sp > (int)sp[k]? max_sp : sp[k];
226 for (k = 0; k < b->n_smpl; ++k)
228 max_gq = max_gq > (int)gq[k]? max_gq : gq[k];
229 for (k = 0; k < b->n_smpl; ++k) {
231 a1 = gt[k]&7; a2 = gt[k]>>3&7;
232 if ((!a1 && a2) || (!a2 && a1)) { // a het
233 if (gq == 0) ++n_het;
234 else if (gq[k] >= 20) ++n_het;
237 if (n_het) max_sp -= (int)(4.343 * log(n_het) + .499);
238 if (max_sp < 0) max_sp = 0;
239 memset(&str, 0, sizeof(kstring_t));
240 if (*b->info) kputc(';', &str);
241 ksprintf(&str, "MXSP=%d;MXGQ=%d", max_sp, max_gq);
242 bcf_append_info(b, str.s, str.l);
247 // FIXME: only data are shuffled; the header is NOT
248 int bcf_shuffle(bcf1_t *b, int seed)
251 if (seed > 0) srand48(seed);
252 a = malloc(b->n_smpl * sizeof(int));
253 for (i = 0; i < b->n_smpl; ++i) a[i] = i;
254 for (i = b->n_smpl; i > 1; --i) {
256 j = (int)(drand48() * i);
257 tmp = a[j]; a[j] = a[i-1]; a[i-1] = tmp;
259 for (j = 0; j < b->n_gi; ++j) {
260 bcf_ginfo_t *gi = b->gi + j;
261 uint8_t *swap, *data = (uint8_t*)gi->data;
262 swap = malloc(gi->len * b->n_smpl);
263 for (i = 0; i < b->n_smpl; ++i)
264 memcpy(swap + gi->len * a[i], data + gi->len * i, gi->len);
272 bcf_hdr_t *bcf_hdr_subsam(const bcf_hdr_t *h0, int n, char *const* samples, int *list)
277 khash_t(str2id) *hash;
279 s.l = s.m = 0; s.s = 0;
280 hash = kh_init(str2id);
281 for (i = 0; i < h0->n_smpl; ++i) {
282 k = kh_put(str2id, hash, h0->sns[i], &ret);
285 for (i = j = 0; i < n; ++i) {
286 k = kh_get(str2id, hash, samples[i]);
287 if (k != kh_end(hash)) {
288 list[j++] = kh_val(hash, k);
289 kputs(samples[i], &s); kputc('\0', &s);
292 if (j < n) fprintf(pysamerr, "<%s> %d samples in the list but not in BCF.", __func__, n - j);
293 kh_destroy(str2id, hash);
294 h = calloc(1, sizeof(bcf_hdr_t));
296 h->ns = 0; h->sns = 0;
297 h->name = malloc(h->l_nm); memcpy(h->name, h0->name, h->l_nm);
298 h->txt = calloc(1, h->l_txt + 1); memcpy(h->txt, h0->txt, h->l_txt);
299 h->l_smpl = s.l; h->sname = s.s;
304 int bcf_subsam(int n_smpl, int *list, bcf1_t *b)
307 for (j = 0; j < b->n_gi; ++j) {
308 bcf_ginfo_t *gi = b->gi + j;
310 swap = malloc(gi->len * b->n_smpl);
311 for (i = 0; i < n_smpl; ++i)
312 memcpy(swap + i * gi->len, (uint8_t*)gi->data + list[i] * gi->len, gi->len);
320 static int8_t nt4_table[128] = {
321 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
322 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
323 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 /*'-'*/, 4, 4,
324 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
325 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
326 4, 4, 4, 4, 3, 4, 4, 4, -1, 4, 4, 4, 4, 4, 4, 4,
327 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
328 4, 4, 4, 4, 3, 4, 4, 4, -1, 4, 4, 4, 4, 4, 4, 4
331 int bcf_gl10(const bcf1_t *b, uint8_t *gl)
333 int a[4], k, l, map[4], k1, j, i;
334 const bcf_ginfo_t *PL;
336 if (b->ref[1] != 0 || b->n_alleles > 4) return -1; // ref is not a single base or >4 alleles
337 for (i = 0; i < b->n_gi; ++i)
338 if (b->gi[i].fmt == bcf_str2int("PL", 2)) break;
339 if (i == b->n_gi) return -1; // no PL
341 a[0] = nt4_table[(int)b->ref[0]];
342 if (a[0] > 3 || a[0] < 0) return -1; // ref is not A/C/G/T
343 a[1] = a[2] = a[3] = -2; // -1 has a special meaning
344 if (b->alt[0] == 0) return -1; // no alternate allele
345 map[0] = map[1] = map[2] = map[3] = -2;
347 for (k = 0, s = b->alt, k1 = -1; k < 3 && *s; ++k, s += 2) {
348 if (s[1] != ',' && s[1] != 0) return -1; // ALT is not single base
349 a[k+1] = nt4_table[(int)*s];
350 if (a[k+1] >= 0) map[a[k+1]] = k+1;
352 if (s[1] == 0) break; // the end of the ALT string
354 for (k = 0; k < 4; ++k)
355 if (map[k] < 0) map[k] = k1;
356 for (i = 0; i < b->n_smpl; ++i) {
357 const uint8_t *p = PL->data + i * PL->len; // the PL for the i-th individual
358 uint8_t *g = gl + 10 * i;
359 for (k = j = 0; k < 4; ++k) {
360 for (l = k; l < 4; ++l) {
361 int t, x = map[k], y = map[l];
362 if (x > y) t = x, x = y, y = t; // make sure x is the smaller
363 g[j++] = p[y * (y+1) / 2 + x];
370 int bcf_gl10_indel(const bcf1_t *b, uint8_t *gl)
373 const bcf_ginfo_t *PL;
374 if (b->alt[0] == 0) return -1; // no alternate allele
375 for (i = 0; i < b->n_gi; ++i)
376 if (b->gi[i].fmt == bcf_str2int("PL", 2)) break;
377 if (i == b->n_gi) return -1; // no PL
379 for (i = 0; i < b->n_smpl; ++i) {
380 const uint8_t *p = PL->data + i * PL->len; // the PL for the i-th individual
381 uint8_t *g = gl + 10 * i;
382 for (k = j = 0; k < 4; ++k) {
383 for (l = k; l < 4; ++l) {
385 if (x > y) t = x, x = y, y = t; // make sure x is the smaller
386 x = y * (y+1) / 2 + x;
387 g[j++] = x < PL->len? p[x] : 255;