# New upstream release (new copyright notices and licenses proofchecked).
[samtools.git] / kseq.h
1 /* The MIT License
2
3    Copyright (c) 2008 Genome Research Ltd (GRL).
4
5    Permission is hereby granted, free of charge, to any person obtaining
6    a copy of this software and associated documentation files (the
7    "Software"), to deal in the Software without restriction, including
8    without limitation the rights to use, copy, modify, merge, publish,
9    distribute, sublicense, and/or sell copies of the Software, and to
10    permit persons to whom the Software is furnished to do so, subject to
11    the following conditions:
12
13    The above copyright notice and this permission notice shall be
14    included in all copies or substantial portions of the Software.
15
16    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20    BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21    ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22    CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23    SOFTWARE.
24 */
25
26 /* Contact: Heng Li <lh3@sanger.ac.uk> */
27
28 /*
29   2009-07-16 (lh3): in kstream_t, change "char*" to "unsigned char*"
30  */
31
32 /* Last Modified: 12APR2009 */
33
34 #ifndef AC_KSEQ_H
35 #define AC_KSEQ_H
36
37 #include <ctype.h>
38 #include <string.h>
39 #include <stdlib.h>
40
/* Special delimiter codes understood by ks_getuntil().  Any delimiter value
   greater than KS_SEP_MAX is matched as a literal character instead. */
#define KS_SEP_SPACE 0 /* any character for which isspace() is true */
#define KS_SEP_TAB   1 /* any isspace() character other than ' ' */
#define KS_SEP_MAX   1 /* largest special delimiter code */
44
/* Declares kstream_t, a buffered reader around a handle of type type_t.
     buf     buffer that receives the bytes fetched by the read function
     begin   index of the next byte to hand out
     end     number of bytes stored by the most recent read
     is_eof  set once a read returned fewer bytes than were requested
     f       the underlying handle passed to the read function */
#define __KS_TYPE(type_t) \
	typedef struct __kstream_t { \
		unsigned char *buf; \
		int begin; \
		int end; \
		int is_eof; \
		type_t f; \
	} kstream_t;
51
/* Non-zero when the stream has been flagged as finished and every buffered
   byte has been consumed. */
#define ks_eof(ks) ((ks)->is_eof ? (ks)->begin >= (ks)->end : 0)
/* Clears the buffer indices and the end-of-input flag; evaluates to 0.
   Only the kstream_t fields are reset -- the handle itself is not touched. */
#define ks_rewind(ks) ((ks)->end = 0, (ks)->begin = 0, (ks)->is_eof = 0)
54
/* Generates the constructor and destructor of kstream_t for handles of type
   type_t and an internal buffer of __bufsize bytes.

   ks_init(f)      Allocates a zero-initialised stream that reads from f.
                   Returns NULL when either the stream or its buffer cannot
                   be allocated; nothing is leaked in that case.
   ks_destroy(ks)  Frees the buffer and the stream.  NULL is accepted.  The
                   handle f is not closed. */
#define __KS_BASIC(type_t, __bufsize) \
	static inline kstream_t *ks_init(type_t f) \
	{ \
		kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
		if (ks == NULL) return NULL; \
		ks->buf = (unsigned char*)malloc(__bufsize); \
		if (ks->buf == NULL) { /* never hand out a stream without a buffer */ \
			free(ks); \
			return NULL; \
		} \
		ks->f = f; \
		return ks; \
	} \
	static inline void ks_destroy(kstream_t *ks) \
	{ \
		if (ks == NULL) return; \
		free(ks->buf); \
		free(ks); \
	}
70
/* Generates ks_getc(), which returns the next byte of the stream as a value
   in 0..255, or -1 when no more data is available.

   The buffer is refilled through __read(ks->f, ks->buf, __bufsize) whenever
   it has been drained.  A read that yields fewer than __bufsize bytes marks
   the stream as finished.  A read that yields zero bytes or a negative value
   (the error convention of readers such as read() and gzread()) returns -1
   and leaves the buffer empty, so no stale byte is ever handed out. */
#define __KS_GETC(__read, __bufsize) \
	static inline int ks_getc(kstream_t *ks) \
	{ \
		if (ks->begin >= ks->end) { \
			if (ks->is_eof) return -1; \
			ks->begin = 0; \
			ks->end = __read(ks->f, ks->buf, __bufsize); \
			if (ks->end < __bufsize) ks->is_eof = 1; \
			if (ks->end <= 0) { /* nothing was read, or the reader failed */ \
				ks->end = 0; \
				return -1; \
			} \
		} \
		return (int)ks->buf[ks->begin++]; \
	}
83
#ifndef KSTRING_T
#define KSTRING_T kstring_t
/* Growable character buffer: s points to m allocated bytes, of which the
   first l hold the current contents. */
typedef struct __kstring_t {
	size_t l;
	size_t m;
	char *s;
} kstring_t;
#endif
91
#ifndef kroundup32
/* Rounds the lvalue x up, in place, to the nearest power of two; a value
   that already is a power of two is left unchanged.  x is evaluated several
   times and therefore must be free of side effects.  The largest shift is
   16 bits, so only values that fit in 32 bits are rounded correctly. */
#define kroundup32(x) \
	(--(x), \
	 (x) |= (x) >> 1, \
	 (x) |= (x) >> 2, \
	 (x) |= (x) >> 4, \
	 (x) |= (x) >> 8, \
	 (x) |= (x) >> 16, \
	 ++(x))
#endif
95
/* Generates ks_getuntil(), which reads bytes from ks into str up to, but not
   including, the next delimiter.

   delimiter  KS_SEP_SPACE, KS_SEP_TAB, or a literal character code greater
              than KS_SEP_MAX.
   str        receives the token; its previous contents are discarded, its
              buffer is reused and grown as needed, and the result is always
              NUL-terminated.
   dret       when non-NULL, receives the delimiter that ended the token, or
              0 when the input ended first.

   Returns the token length, or -1 when the stream was already exhausted on
   entry.  The delimiter itself is consumed.  A read that yields zero bytes
   or a negative value (reader error) ends the token.

   An existing str->s is kept when the token is empty; a one-byte buffer is
   allocated only when str has no buffer at all, so repeated empty tokens do
   not leak the previous allocation. */
#define __KS_GETUNTIL(__read, __bufsize) \
	static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
	{ \
		if (dret) *dret = 0; \
		str->l = 0; \
		if (ks->begin >= ks->end && ks->is_eof) return -1; \
		for (;;) { \
			int i; \
			if (ks->begin >= ks->end) { /* buffer drained: refill it */ \
				if (ks->is_eof) break; \
				ks->begin = 0; \
				ks->end = __read(ks->f, ks->buf, __bufsize); \
				if (ks->end < __bufsize) ks->is_eof = 1; \
				if (ks->end <= 0) { /* empty read or reader error */ \
					ks->end = 0; \
					break; \
				} \
			} \
			/* find the first delimiter in buf[begin..end) */ \
			if (delimiter > KS_SEP_MAX) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (ks->buf[i] == delimiter) break; \
			} else if (delimiter == KS_SEP_SPACE) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (isspace(ks->buf[i])) break; \
			} else if (delimiter == KS_SEP_TAB) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
			} else i = 0; /* never come to here! */ \
			/* make room for the new bytes plus the terminating NUL */ \
			if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \
				str->m = str->l + (i - ks->begin) + 1; \
				kroundup32(str->m); \
				str->s = (char*)realloc(str->s, str->m); \
			} \
			memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
			str->l = str->l + (i - ks->begin); \
			ks->begin = i + 1; /* step over the delimiter */ \
			if (i < ks->end) { /* stopped on a delimiter, not on the buffer end */ \
				if (dret) *dret = ks->buf[i]; \
				break; \
			} \
		} \
		if (str->s == NULL) { /* nothing was ever stored: provide an empty string */ \
			str->m = 1; \
			str->s = (char*)calloc(1, 1); \
		} \
		str->s[str->l] = '\0'; \
		return (int)str->l; \
	}
142
/* Instantiates the whole kstream_t API -- the type itself, ks_init(),
   ks_destroy(), ks_getc() and ks_getuntil() -- for handles of type type_t.
   __read is called as __read(handle, buffer, __bufsize) and its result is
   used as the number of bytes stored in the buffer. */
#define KSTREAM_INIT(type_t, __read, __bufsize) \
	__KS_TYPE(type_t) \
	__KS_BASIC(type_t, __bufsize) \
	__KS_GETC(__read, __bufsize) \
	__KS_GETUNTIL(__read, __bufsize)
148
/* Generates the life-cycle helpers of kseq_t for handles of type type_t.

   kseq_init(fd)     Allocates a zero-initialised record reader on top of a
                     new stream over fd.  Returns NULL when the reader cannot
                     be allocated or when ks_init() yields NULL.
   kseq_rewind(ks)   Forgets the pending header character and resets the
                     stream buffer through ks_rewind().  Only in-memory state
                     is reset; the handle is not touched here.
   kseq_destroy(ks)  Frees the four string buffers, the stream and the reader
                     itself.  NULL is accepted.  The handle is not closed. */
#define __KSEQ_BASIC(type_t) \
	static inline kseq_t *kseq_init(type_t fd) \
	{ \
		kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
		if (s == NULL) return NULL; \
		s->f = ks_init(fd); \
		if (s->f == NULL) { /* do not return a reader without a stream */ \
			free(s); \
			return NULL; \
		} \
		return s; \
	} \
	static inline void kseq_rewind(kseq_t *ks) \
	{ \
		ks->last_char = 0; \
		ks_rewind(ks->f); \
	} \
	static inline void kseq_destroy(kseq_t *ks) \
	{ \
		if (ks == NULL) return; \
		free(ks->name.s); \
		free(ks->comment.s); \
		free(ks->seq.s); \
		free(ks->qual.s); \
		ks_destroy(ks->f); \
		free(ks); \
	}
168
/* Generates kseq_read(), which parses the next FASTA or FASTQ record from
   seq->f.  The header word goes to seq->name, the rest of the header line to
   seq->comment, the printable sequence characters to seq->seq and, for
   records with a '+' line, the quality characters to seq->qual.

   Return value:
   >=0  length of the sequence (normal)
   -1   end-of-file
   -2   truncated quality string

   seq->seq.s is always allocated and NUL-terminated when a non-negative
   value is returned, including for records whose sequence is empty.
 */
#define __KSEQ_READ \
	static int kseq_read(kseq_t *seq) \
	{ \
		int c; \
		kstream_t *ks = seq->f; \
		if (seq->last_char == 0) { /* then jump to the next header line */ \
			while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
			if (c == -1) return -1; /* end of file */ \
			seq->last_char = c; \
		} /* the first header char has been read */ \
		seq->comment.l = seq->seq.l = seq->qual.l = 0; \
		if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; \
		if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); \
		while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
			if (isgraph(c)) { /* printable non-space character */ \
				if (seq->seq.l + 1 >= seq->seq.m) { /* double the memory */ \
					seq->seq.m = seq->seq.l + 2; \
					kroundup32(seq->seq.m); /* rounded to next closest 2^k */ \
					seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
				} \
				seq->seq.s[seq->seq.l++] = (char)c; \
			} \
		} \
		if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
		if (seq->seq.l >= seq->seq.m) { /* empty sequence: the buffer may not exist yet */ \
			seq->seq.m = seq->seq.l + 1; \
			kroundup32(seq->seq.m); \
			seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
		} \
		seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \
		if (c != '+') return (int)seq->seq.l; /* FASTA */ \
		if (seq->qual.m < seq->seq.m) { /* allocate enough memory */ \
			seq->qual.m = seq->seq.m; \
			seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
		} \
		while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
		if (c == -1) return -2; /* we should not stop here */ \
		while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l) \
			if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (char)c; \
		seq->qual.s[seq->qual.l] = 0; /* null terminated string */ \
		seq->last_char = 0; /* we have not come to the next header line */ \
		if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */ \
		return (int)seq->seq.l; \
	}
213
/* Declares kseq_t, the state of a FASTA/FASTQ record reader.
     name, comment, seq, qual  string buffers for the fields of one record
     last_char                 header character ('>' or '@') that has already
                               been consumed from the stream, or 0 when none
     f                         the buffered input stream
   The type_t argument is not used by this declaration. */
#define __KSEQ_TYPE(type_t) \
	typedef struct { \
		kstring_t name; \
		kstring_t comment; \
		kstring_t seq; \
		kstring_t qual; \
		int last_char; \
		kstream_t *f; \
	} kseq_t;
220
/* Instantiates the complete record reader for handles of type type_t: the
   kstream_t API with a 4096-byte buffer, the kseq_t type, kseq_init(),
   kseq_rewind(), kseq_destroy() and kseq_read(). */
#define KSEQ_INIT(type_t, __read) \
	KSTREAM_INIT(type_t, __read, 4096) \
	__KSEQ_TYPE(type_t) \
	__KSEQ_BASIC(type_t) \
	__KSEQ_READ
226
227 #endif