X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=samtools.git;a=blobdiff_plain;f=bcftools%2Fvcf.c;h=9daa845cba6c13b7d2190fbf77f4877b7979178c;hp=9b661ff5b4fbc29dde6a53b5e71c027f131a81f4;hb=c34624801b980425af68c3c431423c72b18c14fe;hpb=f2f3968e11eead9ce5601b01890bc2339ff951e9 diff --git a/bcftools/vcf.c b/bcftools/vcf.c index 9b661ff..9daa845 100644 --- a/bcftools/vcf.c +++ b/bcftools/vcf.c @@ -72,6 +72,33 @@ bcf_t *vcf_open(const char *fn, const char *mode) return bp; } +int vcf_dictread(bcf_t *bp, bcf_hdr_t *h, const char *fn) +{ + vcf_t *v; + gzFile fp; + kstream_t *ks; + kstring_t s, rn; + int dret; + if (bp == 0) return -1; + if (!bp->is_vcf) return 0; + s.l = s.m = 0; s.s = 0; + rn.m = rn.l = h->l_nm; rn.s = h->name; + v = (vcf_t*)bp->v; + fp = gzopen(fn, "r"); + ks = ks_init(fp); + while (ks_getuntil(ks, 0, &s, &dret) >= 0) { + bcf_str2id_add(v->refhash, strdup(s.s)); + kputs(s.s, &rn); kputc('\0', &rn); + if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret); + } + ks_destroy(ks); + gzclose(fp); + h->l_nm = rn.l; h->name = rn.s; + bcf_hdr_sync(h); + free(s.s); + return 0; +} + int vcf_close(bcf_t *bp) { vcf_t *v; @@ -84,7 +111,7 @@ int vcf_close(bcf_t *bp) } if (v->fpout) fclose(v->fpout); free(v->line.s); - bcf_str2id_destroy(v->refhash); + bcf_str2id_thorough_destroy(v->refhash); free(v); free(bp); return 0; @@ -93,15 +120,14 @@ int vcf_close(bcf_t *bp) int vcf_hdr_write(bcf_t *bp, const bcf_hdr_t *h) { vcf_t *v = (vcf_t*)bp->v; - int i, has_ref = 0, has_ver = 0; + int i, has_ver = 0; if (!bp->is_vcf) return bcf_hdr_write(bp, h); if (h->l_txt > 0) { if (strstr(h->txt, "##fileformat=")) has_ver = 1; - if (has_ver == 0) fprintf(v->fpout, "##fileformat=VCFv4.0\n"); + if (has_ver == 0) fprintf(v->fpout, "##fileformat=VCFv4.1\n"); fwrite(h->txt, 1, h->l_txt - 1, v->fpout); - if (strstr(h->txt, "##SQ=")) has_ref = 1; } - if (has_ver == 0) fprintf(v->fpout, "##fileformat=VCFv4.0\n"); + if (h->l_txt == 0) fprintf(v->fpout, "##fileformat=VCFv4.1\n"); fprintf(v->fpout, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"); for (i = 0; i < h->n_smpl; ++i) fprintf(v->fpout, "\t%s", h->sns[i]); @@ -138,7 +164,7 @@ int vcf_read(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b) if (k == 0) { // ref int tid = bcf_str2id(v->refhash, p); if (tid < 0) { - tid = bcf_str2id_add(v->refhash, p); + tid = bcf_str2id_add(v->refhash, strdup(p)); kputs(p, &rn); kputc('\0', &rn); sync = 1; } @@ -156,8 +182,10 @@ int vcf_read(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b) for (i = 0; i < b->n_gi; ++i) { if (b->gi[i].fmt == bcf_str2int("GT", 2)) { ((uint8_t*)b->gi[i].data)[k-9] = 1<<7; - } else if (b->gi[i].fmt == bcf_str2int("GQ", 2) || b->gi[i].fmt == bcf_str2int("SP", 2)) { + } else if (b->gi[i].fmt == bcf_str2int("GQ", 2)) { ((uint8_t*)b->gi[i].data)[k-9] = 0; + } else if (b->gi[i].fmt == bcf_str2int("SP", 2)) { + ((int32_t*)b->gi[i].data)[k-9] = 0; } else if (b->gi[i].fmt == bcf_str2int("DP", 2)) { ((uint16_t*)b->gi[i].data)[k-9] = 0; } else if (b->gi[i].fmt == bcf_str2int("PL", 2)) { @@ -173,11 +201,15 @@ int vcf_read(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b) for (q = kstrtok(p, ":", &a2), i = 0; q && i < b->n_gi; q = kstrtok(0, 0, &a2), ++i) { if (b->gi[i].fmt == bcf_str2int("GT", 2)) { ((uint8_t*)b->gi[i].data)[k-9] = (q[0] - '0')<<3 | (q[2] - '0') | (q[1] == '/'? 0 : 1) << 6; - } else if (b->gi[i].fmt == bcf_str2int("GQ", 2) || b->gi[i].fmt == bcf_str2int("SP", 2)) { + } else if (b->gi[i].fmt == bcf_str2int("GQ", 2)) { double _x = strtod(q, &q); int x = (int)(_x + .499); if (x > 255) x = 255; ((uint8_t*)b->gi[i].data)[k-9] = x; + } else if (b->gi[i].fmt == bcf_str2int("SP", 2)) { + int x = strtol(q, &q, 10); + if (x > 0xffff) x = 0xffff; + ((uint32_t*)b->gi[i].data)[k-9] = x; } else if (b->gi[i].fmt == bcf_str2int("DP", 2)) { int x = strtol(q, &q, 10); if (x > 0xffff) x = 0xffff; @@ -198,7 +230,7 @@ int vcf_read(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b) y = b->n_alleles * (b->n_alleles + 1) / 2; for (j = 0; j < y; ++j) { x = strtod(q, &q); - data[(k-9) * y + j] = x; + data[(k-9) * y + j] = x > 0? -x/10. : x; ++q; } }