Imported Upstream version 0.5
[pysam.git] / samtools / bcftools / vcf.c.pysam.c
1 #include "pysam.h"
2
3 #include <zlib.h>
4 #include <stdlib.h>
5 #include <stdio.h>
6 #include <string.h>
7 #include "bcf.h"
8 #include "kstring.h"
9 #include "kseq.h"
10 KSTREAM_INIT(gzFile, gzread, 4096)
11
12 typedef struct {
13         gzFile fp;
14         FILE *fpout;
15         kstream_t *ks;
16         void *refhash;
17         kstring_t line;
18         int max_ref;
19 } vcf_t;
20
21 bcf_hdr_t *vcf_hdr_read(bcf_t *bp)
22 {
23         kstring_t meta, smpl;
24         int dret;
25         vcf_t *v;
26         bcf_hdr_t *h;
27         if (!bp->is_vcf) return bcf_hdr_read(bp);
28         h = calloc(1, sizeof(bcf_hdr_t));
29         v = (vcf_t*)bp->v;
30         v->line.l = 0;
31         memset(&meta, 0, sizeof(kstring_t));
32         memset(&smpl, 0, sizeof(kstring_t));
33         while (ks_getuntil(v->ks, '\n', &v->line, &dret) >= 0) {
34                 if (v->line.l < 2) continue;
35                 if (v->line.s[0] != '#') return 0; // no sample line
36                 if (v->line.s[0] == '#' && v->line.s[1] == '#') {
37                         kputsn(v->line.s, v->line.l, &meta); kputc('\n', &meta);
38                 } else if (v->line.s[0] == '#') {
39                         int k;
40                         ks_tokaux_t aux;
41                         char *p;
42                         for (p = kstrtok(v->line.s, "\t\n", &aux), k = 0; p; p = kstrtok(0, 0, &aux), ++k) {
43                                 if (k >= 9) {
44                                         kputsn(p, aux.p - p, &smpl);
45                                         kputc('\0', &smpl);
46                                 }
47                         }
48                         break;
49                 }
50         }
51         kputc('\0', &meta);
52         h->name = 0;
53         h->sname = smpl.s; h->l_smpl = smpl.l;
54         h->txt = meta.s; h->l_txt = meta.l;
55         bcf_hdr_sync(h);
56         return h;
57 }
58
59 bcf_t *vcf_open(const char *fn, const char *mode)
60 {
61         bcf_t *bp;
62         vcf_t *v;
63         if (strchr(mode, 'b')) return bcf_open(fn, mode);
64         bp = calloc(1, sizeof(bcf_t));
65         v = calloc(1, sizeof(vcf_t));
66         bp->is_vcf = 1;
67         bp->v = v;
68         v->refhash = bcf_str2id_init();
69         if (strchr(mode, 'r')) {
70                 v->fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
71                 v->ks = ks_init(v->fp);
72         } else if (strchr(mode, 'w'))
73                 v->fpout = strcmp(fn, "-")? fopen(fn, "w") : stdout;
74         return bp;
75 }
76
77 int vcf_dictread(bcf_t *bp, bcf_hdr_t *h, const char *fn)
78 {
79         vcf_t *v;
80         gzFile fp;
81         kstream_t *ks;
82         kstring_t s, rn;
83         int dret;
84         if (bp == 0) return -1;
85         if (!bp->is_vcf) return 0;
86         s.l = s.m = 0; s.s = 0;
87         rn.m = rn.l = h->l_nm; rn.s = h->name;
88         v = (vcf_t*)bp->v;
89         fp = gzopen(fn, "r");
90         ks = ks_init(fp);
91         while (ks_getuntil(ks, 0, &s, &dret) >= 0) {
92                 bcf_str2id_add(v->refhash, strdup(s.s));
93                 kputs(s.s, &rn); kputc('\0', &rn);
94                 if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret);
95         }
96         ks_destroy(ks);
97         gzclose(fp);
98         h->l_nm = rn.l; h->name = rn.s;
99         bcf_hdr_sync(h);
100         free(s.s);
101         return 0;
102 }
103
104 int vcf_close(bcf_t *bp)
105 {
106         vcf_t *v;
107         if (bp == 0) return -1;
108         if (!bp->is_vcf) return bcf_close(bp);
109         v = (vcf_t*)bp->v;
110         if (v->fp) {
111                 ks_destroy(v->ks);
112                 gzclose(v->fp);
113         }
114         if (v->fpout) fclose(v->fpout);
115         free(v->line.s);
116         bcf_str2id_thorough_destroy(v->refhash);
117         free(v);
118         free(bp);
119         return 0;
120 }
121
122 int vcf_hdr_write(bcf_t *bp, const bcf_hdr_t *h)
123 {
124         vcf_t *v = (vcf_t*)bp->v;
125         int i, has_ver = 0;
126         if (!bp->is_vcf) return bcf_hdr_write(bp, h);
127         if (h->l_txt > 0) {
128                 if (strstr(h->txt, "##fileformat=")) has_ver = 1;
129                 if (has_ver == 0) fprintf(v->fpout, "##fileformat=VCFv4.1\n");
130                 fwrite(h->txt, 1, h->l_txt - 1, v->fpout);
131         }
132         if (h->l_txt == 0) fprintf(v->fpout, "##fileformat=VCFv4.1\n");
133         fprintf(v->fpout, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT");
134         for (i = 0; i < h->n_smpl; ++i)
135                 fprintf(v->fpout, "\t%s", h->sns[i]);
136         fputc('\n', v->fpout);
137         return 0;
138 }
139
140 int vcf_write(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b)
141 {
142         vcf_t *v = (vcf_t*)bp->v;
143         extern void bcf_fmt_core(const bcf_hdr_t *h, bcf1_t *b, kstring_t *s);
144         if (!bp->is_vcf) return bcf_write(bp, h, b);
145         bcf_fmt_core(h, b, &v->line);
146         fwrite(v->line.s, 1, v->line.l, v->fpout);
147         fputc('\n', v->fpout);
148         return v->line.l + 1;
149 }
150
151 int vcf_read(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b)
152 {
153         int dret, k, i, sync = 0;
154         vcf_t *v = (vcf_t*)bp->v;
155         char *p, *q;
156         kstring_t str, rn;
157         ks_tokaux_t aux, a2;
158         if (!bp->is_vcf) return bcf_read(bp, h, b);
159         v->line.l = 0;
160         str.l = 0; str.m = b->m_str; str.s = b->str;
161         rn.l = rn.m = h->l_nm; rn.s = h->name;
162         if (ks_getuntil(v->ks, '\n', &v->line, &dret) < 0) return -1;
163         b->n_smpl = h->n_smpl;
164         for (p = kstrtok(v->line.s, "\t", &aux), k = 0; p; p = kstrtok(0, 0, &aux), ++k) {
165                 *(char*)aux.p = 0;
166                 if (k == 0) { // ref
167                         int tid = bcf_str2id(v->refhash, p);
168                         if (tid < 0) {
169                                 tid = bcf_str2id_add(v->refhash, strdup(p));
170                                 kputs(p, &rn); kputc('\0', &rn);
171                                 sync = 1;
172                         }
173                         b->tid = tid;
174                 } else if (k == 1) { // pos
175                         b->pos = atoi(p) - 1;
176                 } else if (k == 5) { // qual
177                         b->qual = (p[0] >= '0' && p[0] <= '9')? atof(p) : 0;
178                 } else if (k <= 8) { // variable length strings
179                         kputs(p, &str); kputc('\0', &str);
180                         b->l_str = str.l; b->m_str = str.m; b->str = str.s;
181                         if (k == 8) bcf_sync(b);
182                 } else { // k > 9
183                         if (strncmp(p, "./.", 3) == 0) {
184                                 for (i = 0; i < b->n_gi; ++i) {
185                                         if (b->gi[i].fmt == bcf_str2int("GT", 2)) {
186                                                 ((uint8_t*)b->gi[i].data)[k-9] = 1<<7;
187                                         } else if (b->gi[i].fmt == bcf_str2int("GQ", 2)) {
188                                                 ((uint8_t*)b->gi[i].data)[k-9] = 0;
189                                         } else if (b->gi[i].fmt == bcf_str2int("SP", 2)) {
190                                                 ((int32_t*)b->gi[i].data)[k-9] = 0;
191                                         } else if (b->gi[i].fmt == bcf_str2int("DP", 2)) {
192                                                 ((uint16_t*)b->gi[i].data)[k-9] = 0;
193                                         } else if (b->gi[i].fmt == bcf_str2int("PL", 2)) {
194                                                 int y = b->n_alleles * (b->n_alleles + 1) / 2;
195                                                 memset((uint8_t*)b->gi[i].data + (k - 9) * y, 0, y);
196                                         } else if (b->gi[i].fmt == bcf_str2int("GL", 2)) {
197                                                 int y = b->n_alleles * (b->n_alleles + 1) / 2;
198                                                 memset((float*)b->gi[i].data + (k - 9) * y, 0, y * 4);
199                                         }
200                                 }
201                                 goto endblock;
202                         }
203                         for (q = kstrtok(p, ":", &a2), i = 0; q && i < b->n_gi; q = kstrtok(0, 0, &a2), ++i) {
204                                 if (b->gi[i].fmt == bcf_str2int("GT", 2)) {
205                                         ((uint8_t*)b->gi[i].data)[k-9] = (q[0] - '0')<<3 | (q[2] - '0') | (q[1] == '/'? 0 : 1) << 6;
206                                 } else if (b->gi[i].fmt == bcf_str2int("GQ", 2)) {
207                                         double _x = strtod(q, &q);
208                                         int x = (int)(_x + .499);
209                                         if (x > 255) x = 255;
210                                         ((uint8_t*)b->gi[i].data)[k-9] = x;
211                                 } else if (b->gi[i].fmt == bcf_str2int("SP", 2)) {
212                                         int x = strtol(q, &q, 10);
213                                         if (x > 0xffff) x = 0xffff;
214                                         ((uint32_t*)b->gi[i].data)[k-9] = x;
215                                 } else if (b->gi[i].fmt == bcf_str2int("DP", 2)) {
216                                         int x = strtol(q, &q, 10);
217                                         if (x > 0xffff) x = 0xffff;
218                                         ((uint16_t*)b->gi[i].data)[k-9] = x;
219                                 } else if (b->gi[i].fmt == bcf_str2int("PL", 2)) {
220                                         int x, y, j;
221                                         uint8_t *data = (uint8_t*)b->gi[i].data;
222                                         y = b->n_alleles * (b->n_alleles + 1) / 2;
223                                         for (j = 0; j < y; ++j) {
224                                                 x = strtol(q, &q, 10);
225                                                 if (x > 255) x = 255;
226                                                 data[(k-9) * y + j] = x;
227                                                 ++q;
228                                         }
229                                 } else if (b->gi[i].fmt == bcf_str2int("GL", 2)) {
230                                         int j, y;
231                                         float x, *data = (float*)b->gi[i].data;
232                                         y = b->n_alleles * (b->n_alleles + 1) / 2;
233                                         for (j = 0; j < y; ++j) {
234                                                 x = strtod(q, &q);
235                                                 data[(k-9) * y + j] = x > 0? -x/10. : x;
236                                                 ++q;
237                                         }
238                                 }
239                         }
240                 endblock: i = i;
241                 }
242         }
243         h->l_nm = rn.l; h->name = rn.s;
244         if (sync) bcf_hdr_sync(h);
245         return v->line.l + 1;
246 }