Merge commit 'upstream/0.1.18'
[samtools.git] / kseq.h
1 /* The MIT License
2
3    Copyright (c) 2008, 2009, 2011 Attractive Chaos <attractor@live.co.uk>
4
5    Permission is hereby granted, free of charge, to any person obtaining
6    a copy of this software and associated documentation files (the
7    "Software"), to deal in the Software without restriction, including
8    without limitation the rights to use, copy, modify, merge, publish,
9    distribute, sublicense, and/or sell copies of the Software, and to
10    permit persons to whom the Software is furnished to do so, subject to
11    the following conditions:
12
13    The above copyright notice and this permission notice shall be
14    included in all copies or substantial portions of the Software.
15
16    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20    BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21    ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22    CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23    SOFTWARE.
24 */
25
26 /* Last Modified: 18AUG2011 */
27
28 #ifndef AC_KSEQ_H
29 #define AC_KSEQ_H
30
31 #include <ctype.h>
32 #include <string.h>
33 #include <stdlib.h>
34
/* Delimiter classes accepted by ks_getuntil()/ks_getuntil2(). A delimiter
 * value greater than KS_SEP_MAX is compared against input bytes literally. */
#define KS_SEP_SPACE 0 /* any byte for which isspace() is true */
#define KS_SEP_TAB   1 /* isspace() bytes other than ' ' */
#define KS_SEP_MAX   1 /* largest delimiter-class code */
38
/* Declares kstream_t, a buffered reader wrapped around a handle of type
 * type_t.
 *   buf    - read buffer
 *   begin  - index in buf of the next unread byte
 *   end    - number of bytes the last read placed in buf
 *   is_eof - non-zero once the reader has signalled end of input
 *   f      - the wrapped handle
 */
#define __KS_TYPE(type_t)          \
	typedef struct __kstream_t {   \
		unsigned char *buf;        \
		int begin;                 \
		int end;                   \
		int is_eof;                \
		type_t f;                  \
	} kstream_t;
45
/* Non-zero when end of input has been flagged and the buffer is drained. */
#define ks_eof(ks) (!(!(ks)->is_eof || (ks)->begin < (ks)->end))
/* Clears the buffer indices and the end-of-input flag; the wrapped handle
 * is not touched. Evaluates to 0. */
#define ks_rewind(ks) ((ks)->end = 0, (ks)->begin = 0, (ks)->is_eof = 0)
48
/* Generates the constructor/destructor pair for kstream_t.
 *
 * ks_init(f)
 *   Allocates a zeroed stream that wraps handle f, plus a read buffer of
 *   __bufsize bytes. Returns NULL if either allocation fails (nothing is
 *   leaked in that case). The handle is only stored.
 *
 * ks_destroy(ks)
 *   Frees the buffer and the stream object. A NULL ks is accepted. The
 *   wrapped handle is not released here.
 */
#define __KS_BASIC(type_t, __bufsize)                                   \
	static inline kstream_t *ks_init(type_t f)                          \
	{                                                                   \
		kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t));       \
		if (ks == NULL) return NULL;                                    \
		ks->buf = (unsigned char*)malloc(__bufsize);                    \
		if (ks->buf == NULL) { /* never hand out a stream without a buffer */ \
			free(ks);                                                   \
			return NULL;                                                \
		}                                                               \
		ks->f = f;                                                      \
		return ks;                                                      \
	}                                                                   \
	static inline void ks_destroy(kstream_t *ks)                        \
	{                                                                   \
		if (ks) {                                                       \
			free(ks->buf);                                              \
			free(ks);                                                   \
		}                                                               \
	}
64
/* Generates ks_getc(ks): returns the next byte of the stream as a value in
 * 0..255, refilling the buffer through __read(ks->f, ks->buf, __bufsize)
 * when it is drained. Returns -1 once no more input is available; a
 * negative result from __read is handled the same way as end of input, so
 * a stale buffer byte is never returned.
 *
 * NOTE(review): a read that yields fewer than __bufsize bytes marks the
 * stream as finished. That presumes __read only comes up short at end of
 * input - confirm for the reader being plugged in.
 */
#define __KS_GETC(__read, __bufsize)                                    \
	static inline int ks_getc(kstream_t *ks)                            \
	{                                                                   \
		if (ks->is_eof && ks->begin >= ks->end) return -1;              \
		if (ks->begin >= ks->end) {                                     \
			ks->begin = 0;                                              \
			ks->end = __read(ks->f, ks->buf, __bufsize);                \
			if (ks->end < __bufsize) ks->is_eof = 1;                    \
			if (ks->end <= 0) { /* end of input or failed read */       \
				ks->end = 0; /* keep begin/end consistent for ks_eof() */ \
				return -1;                                              \
			}                                                           \
		}                                                               \
		return (int)ks->buf[ks->begin++];                               \
	}
77
#ifndef KSTRING_T
#define KSTRING_T kstring_t
/* Growable, NUL-terminated character buffer. */
typedef struct __kstring_t {
	size_t l; /* number of bytes currently stored in s */
	size_t m; /* number of bytes allocated for s */
	char *s;  /* heap storage; may be 0 before first use */
} kstring_t;
#endif
85
#ifndef kroundup32
/* Rounds x up, in place, to the next power of two (x is left unchanged when
 * it already is one) and evaluates to the new value. x must be a modifiable
 * lvalue and is evaluated several times. The shifts cover 32 bits only. */
#define kroundup32(x)      \
	(--(x),                \
	 (x) |= (x) >> 1,      \
	 (x) |= (x) >> 2,      \
	 (x) |= (x) >> 4,      \
	 (x) |= (x) >> 8,      \
	 (x) |= (x) >> 16,     \
	 ++(x))
#endif
89
/* Generates ks_getuntil2() and its non-appending wrapper ks_getuntil().
 *
 * ks_getuntil2(ks, delimiter, str, dret, append)
 *   Copies bytes from ks into str up to, but not including, the next
 *   delimiter, consumes the delimiter, and NUL-terminates str.
 *     delimiter - KS_SEP_SPACE, KS_SEP_TAB, or a literal byte value greater
 *                 than KS_SEP_MAX
 *     str       - destination buffer, grown with realloc() as needed
 *     dret      - when non-NULL, receives the delimiter byte that ended the
 *                 token, or 0 if input ran out first
 *     append    - non-zero to append to str, zero to overwrite it
 *   Returns the resulting length of str. Returns -1, leaving the bytes of
 *   str untouched, when input was exhausted (end of input or a failed
 *   read) before anything could be examined.
 *
 * ks_getuntil(ks, delimiter, str, dret) is ks_getuntil2() with append = 0.
 */
#define __KS_GETUNTIL(__read, __bufsize)                                \
	static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
	{                                                                   \
		int gotany = 0; /* set once any buffered input has been examined */ \
		if (dret) *dret = 0;                                            \
		str->l = append? str->l : 0;                                    \
		for (;;) {                                                      \
			int i;                                                      \
			if (ks->begin >= ks->end) { /* buffer drained: refill or stop */ \
				if (ks->is_eof) break;                                  \
				ks->begin = 0;                                          \
				ks->end = __read(ks->f, ks->buf, __bufsize);            \
				if (ks->end < __bufsize) ks->is_eof = 1;                \
				if (ks->end <= 0) { /* end of input or failed read */   \
					ks->end = 0;                                        \
					break;                                              \
				}                                                       \
			}                                                           \
			if (delimiter > KS_SEP_MAX) {                               \
				for (i = ks->begin; i < ks->end; ++i)                   \
					if (ks->buf[i] == delimiter) break;                 \
			} else if (delimiter == KS_SEP_SPACE) {                     \
				for (i = ks->begin; i < ks->end; ++i)                   \
					if (isspace(ks->buf[i])) break;                     \
			} else if (delimiter == KS_SEP_TAB) {                       \
				for (i = ks->begin; i < ks->end; ++i)                   \
					if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
			} else i = ks->end; /* negative delimiter: nothing matches, take the whole buffer */ \
			gotany = 1;                                                 \
			if (str->m - str->l < (size_t)(i - ks->begin) + 1) {        \
				str->m = str->l + (i - ks->begin) + 1;                  \
				kroundup32(str->m);                                     \
				str->s = (char*)realloc(str->s, str->m); /* NOTE(review): failure is not checked */ \
			}                                                           \
			memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
			str->l = str->l + (i - ks->begin);                          \
			ks->begin = i + 1;                                          \
			if (i < ks->end) { /* stopped on a delimiter, not on the buffer end */ \
				if (dret) *dret = ks->buf[i];                           \
				break;                                                  \
			}                                                           \
		}                                                               \
		/* Nothing examined means input ended first. Return before the */ \
		/* terminator write below: no room for it was reserved on this path. */ \
		if (!gotany) return -1;                                         \
		if (str->s == 0) {                                              \
			str->m = 1;                                                 \
			str->s = (char*)calloc(1, 1);                               \
		}                                                               \
		str->s[str->l] = '\0';                                          \
		return (int)str->l;                                             \
	}                                                                   \
	static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
	{ return ks_getuntil2(ks, delimiter, str, dret, 0); }
138
/* Instantiates the complete kstream_t API for one reader: the type itself,
 * ks_init()/ks_destroy(), ks_getc() and ks_getuntil2()/ks_getuntil().
 *   type_t    - type of the wrapped handle
 *   __read    - called as __read(handle, buffer, __bufsize); its result is
 *               stored as the number of bytes now in the buffer
 *   __bufsize - size in bytes of the read buffer
 */
#define KSTREAM_INIT(type_t, __read, __bufsize) \
	__KS_TYPE(type_t) \
	__KS_BASIC(type_t, __bufsize) \
	__KS_GETC(__read, __bufsize) \
	__KS_GETUNTIL(__read, __bufsize)
144
/* Generates the lifecycle helpers for kseq_t.
 *
 * kseq_init(fd)
 *   Allocates a zeroed record reader whose stream wraps handle fd.
 *   Returns NULL if the reader or its stream cannot be allocated.
 *
 * kseq_rewind(ks)
 *   Forgets the pending header character and resets the stream's buffer
 *   state. The wrapped handle itself is not repositioned here.
 *
 * kseq_destroy(ks)
 *   Frees the four record strings, the stream and the reader. A NULL ks is
 *   accepted. The wrapped handle is not released here.
 */
#define __KSEQ_BASIC(type_t)                                            \
	static inline kseq_t *kseq_init(type_t fd)                          \
	{                                                                   \
		kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t));                 \
		if (s == NULL) return NULL;                                     \
		s->f = ks_init(fd);                                             \
		if (s->f == NULL) { /* do not return a reader without a stream */ \
			free(s);                                                    \
			return NULL;                                                \
		}                                                               \
		return s;                                                       \
	}                                                                   \
	static inline void kseq_rewind(kseq_t *ks)                          \
	{                                                                   \
		ks->last_char = 0;                                              \
		ks_rewind(ks->f);                                               \
	}                                                                   \
	static inline void kseq_destroy(kseq_t *ks)                         \
	{                                                                   \
		if (!ks) return;                                                \
		free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
		ks_destroy(ks->f);                                              \
		free(ks);                                                       \
	}
164
/* Generates kseq_read(seq): reads the next FASTA/FASTQ record from seq->f
 * into seq->name, seq->comment, seq->seq and (FASTQ only) seq->qual.
 * Sequence and quality may span several lines; empty lines in the sequence
 * part are skipped.
 *
 * Return value:
 *   >=0  length of the sequence (normal)
 *   -1   end-of-file
 *   -2   quality string missing, truncated, or of a different length than
 *        the sequence
 */
#define __KSEQ_READ \
	static int kseq_read(kseq_t *seq) \
	{ \
		int c; \
		kstream_t *ks = seq->f; \
		if (seq->last_char == 0) { /* then jump to the next header line */ \
			while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
			if (c == -1) return -1; /* end of file */ \
			seq->last_char = c; \
		} /* else: the first header char has been read in the previous call */ \
		seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
		if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \
		if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); /* read FASTA/Q comment */ \
		if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
			seq->seq.m = 256; \
			seq->seq.s = (char*)malloc(seq->seq.m); \
		} \
		while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
			if (c == '\n') continue; /* empty line: storing it would also swallow the next line */ \
			seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
			ks_getuntil2(ks, '\n', &seq->seq, 0, 1); /* read the rest of the line */ \
		} \
		if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
		if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
			seq->seq.m = seq->seq.l + 2; \
			kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
			seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
		} \
		seq->seq.s[seq->seq.l] = 0;	/* null terminated string */ \
		if (c != '+') return seq->seq.l; /* FASTA */ \
		if (seq->qual.m < seq->seq.m) {	/* allocate memory for qual in case insufficient */ \
			seq->qual.m = seq->seq.m; \
			seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
		} \
		while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
		if (c == -1) return -2; /* error: no quality string */ \
		while (ks_getuntil2(ks, '\n', &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \
		seq->last_char = 0;	/* we have not come to the next header line */ \
		if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
		return seq->seq.l; \
	}
210
/* Declares kseq_t, the state of one FASTA/FASTQ reader.
 *   name, comment, seq, qual - strings filled in by kseq_read()
 *   last_char - header character ('>' or '@') already consumed while
 *               reading the previous record, or 0 if none is pending
 *   f         - the underlying buffered stream
 * The type_t argument is not used by the expansion.
 */
#define __KSEQ_TYPE(type_t)        \
	typedef struct {               \
		kstring_t name;            \
		kstring_t comment;         \
		kstring_t seq;             \
		kstring_t qual;            \
		int last_char;             \
		kstream_t *f;              \
	} kseq_t;
217
/* Instantiates the FASTA/FASTQ reader for one handle type: the kstream_t
 * layer with a 16384-byte buffer, the kseq_t type, kseq_init(),
 * kseq_rewind(), kseq_destroy() and kseq_read().
 *   type_t - type of the handle passed to kseq_init()
 *   __read - reader forwarded to KSTREAM_INIT
 */
#define KSEQ_INIT(type_t, __read) \
	KSTREAM_INIT(type_t, __read, 16384) \
	__KSEQ_TYPE(type_t) \
	__KSEQ_BASIC(type_t) \
	__KSEQ_READ
223
224 #endif