5 Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
6 2011 Attractive Chaos <attractor@live.co.uk>
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
32 #include <sys/types.h>
37 typedef knetFile *_bgzf_file_t;
38 #define _bgzf_open(fn, mode) knet_open(fn, mode)
39 #define _bgzf_dopen(fp, mode) knet_dopen(fp, mode)
40 #define _bgzf_close(fp) knet_close(fp)
41 #define _bgzf_fileno(fp) ((fp)->fd)
42 #define _bgzf_tell(fp) knet_tell(fp)
43 #define _bgzf_seek(fp, offset, whence) knet_seek(fp, offset, whence)
44 #define _bgzf_read(fp, buf, len) knet_read(fp, buf, len)
45 #define _bgzf_write(fp, buf, len) knet_write(fp, buf, len)
46 #else // ~defined(_USE_KNETFILE)
47 #if defined(_WIN32) || defined(_MSC_VER)
48 #define ftello(fp) ftell(fp)
49 #define fseeko(fp, offset, whence) fseek(fp, offset, whence)
50 #else // ~defined(_WIN32)
51 extern off_t ftello(FILE *stream);
52 extern int fseeko(FILE *stream, off_t offset, int whence);
53 #endif // ~defined(_WIN32)
54 typedef FILE *_bgzf_file_t;
55 #define _bgzf_open(fn, mode) fopen(fn, mode)
56 #define _bgzf_dopen(fp, mode) fdopen(fp, mode)
57 #define _bgzf_close(fp) fclose(fp)
58 #define _bgzf_fileno(fp) fileno(fp)
59 #define _bgzf_tell(fp) ftello(fp)
60 #define _bgzf_seek(fp, offset, whence) fseeko(fp, offset, whence)
61 #define _bgzf_read(fp, buf, len) fread(buf, 1, len, fp)
62 #define _bgzf_write(fp, buf, len) fwrite(buf, 1, len, fp)
63 #endif // ~define(_USE_KNETFILE)
65 #define BLOCK_HEADER_LENGTH 18
66 #define BLOCK_FOOTER_LENGTH 8
68 /* BGZF/GZIP header (speciallized from RFC 1952; little endian):
69 +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
70 | 31|139| 8| 4| 0| 0|255| 6| 66| 67| 2|BLK_LEN|
71 +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
73 static const uint8_t g_magic[19] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\0\0";
82 KHASH_MAP_INIT_INT64(cache, cache_t)
85 static inline void packInt16(uint8_t *buffer, uint16_t value)
88 buffer[1] = value >> 8;
91 static inline int unpackInt16(const uint8_t *buffer)
93 return buffer[0] | buffer[1] << 8;
96 static inline void packInt32(uint8_t *buffer, uint32_t value)
99 buffer[1] = value >> 8;
100 buffer[2] = value >> 16;
101 buffer[3] = value >> 24;
104 static BGZF *bgzf_read_init()
107 fp = calloc(1, sizeof(BGZF));
109 fp->uncompressed_block = malloc(BGZF_BLOCK_SIZE);
110 fp->compressed_block = malloc(BGZF_BLOCK_SIZE);
112 fp->cache = kh_init(cache);
117 static BGZF *bgzf_write_init(int compress_level) // compress_level==-1 for the default level
120 fp = calloc(1, sizeof(BGZF));
122 fp->uncompressed_block = malloc(BGZF_BLOCK_SIZE);
123 fp->compressed_block = malloc(BGZF_BLOCK_SIZE);
124 fp->compress_level = compress_level < 0? Z_DEFAULT_COMPRESSION : compress_level; // Z_DEFAULT_COMPRESSION==-1
125 if (fp->compress_level > 9) fp->compress_level = Z_DEFAULT_COMPRESSION;
128 // get the compress level from the mode string
129 static int mode2level(const char *__restrict mode)
131 int i, compress_level = -1;
132 for (i = 0; mode[i]; ++i)
133 if (mode[i] >= '0' && mode[i] <= '9') break;
134 if (mode[i]) compress_level = (int)mode[i] - '0';
135 if (strchr(mode, 'u')) compress_level = 0;
136 return compress_level;
139 BGZF *bgzf_open(const char *path, const char *mode)
142 if (strchr(mode, 'r') || strchr(mode, 'R')) {
144 if ((fpr = _bgzf_open(path, "r")) == 0) return 0;
145 fp = bgzf_read_init();
147 } else if (strchr(mode, 'w') || strchr(mode, 'W')) {
149 if ((fpw = fopen(path, "w")) == 0) return 0;
150 fp = bgzf_write_init(mode2level(mode));
156 BGZF *bgzf_dopen(int fd, const char *mode)
159 if (strchr(mode, 'r') || strchr(mode, 'R')) {
161 if ((fpr = _bgzf_dopen(fd, "r")) == 0) return 0;
162 fp = bgzf_read_init();
164 } else if (strchr(mode, 'w') || strchr(mode, 'W')) {
166 if ((fpw = fdopen(fd, "w")) == 0) return 0;
167 fp = bgzf_write_init(mode2level(mode));
173 // Deflate the block in fp->uncompressed_block into fp->compressed_block. Also adds an extra field that stores the compressed block length.
174 static int deflate_block(BGZF *fp, int block_length)
176 uint8_t *buffer = fp->compressed_block;
177 int buffer_size = BGZF_BLOCK_SIZE;
178 int input_length = block_length;
179 int compressed_length = 0;
183 assert(block_length <= BGZF_BLOCK_SIZE); // guaranteed by the caller
184 memcpy(buffer, g_magic, BLOCK_HEADER_LENGTH); // the last two bytes are a place holder for the length of the block
185 while (1) { // loop to retry for blocks that do not compress enough
190 zs.next_in = fp->uncompressed_block;
191 zs.avail_in = input_length;
192 zs.next_out = (void*)&buffer[BLOCK_HEADER_LENGTH];
193 zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
194 status = deflateInit2(&zs, fp->compress_level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY); // -15 to disable zlib header/footer
195 if (status != Z_OK) {
196 fp->errcode |= BGZF_ERR_ZLIB;
199 status = deflate(&zs, Z_FINISH);
200 if (status != Z_STREAM_END) { // not compressed enough
201 deflateEnd(&zs); // reset the stream
202 if (status == Z_OK) { // reduce the size and recompress
203 input_length -= 1024;
204 assert(input_length > 0); // logically, this should not happen
207 fp->errcode |= BGZF_ERR_ZLIB;
210 if (deflateEnd(&zs) != Z_OK) {
211 fp->errcode |= BGZF_ERR_ZLIB;
214 compressed_length = zs.total_out;
215 compressed_length += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
216 assert(compressed_length <= BGZF_BLOCK_SIZE);
220 assert(compressed_length > 0);
221 packInt16((uint8_t*)&buffer[16], compressed_length - 1); // write the compressed_length; -1 to fit 2 bytes
222 crc = crc32(0L, NULL, 0L);
223 crc = crc32(crc, fp->uncompressed_block, input_length);
224 packInt32((uint8_t*)&buffer[compressed_length-8], crc);
225 packInt32((uint8_t*)&buffer[compressed_length-4], input_length);
227 remaining = block_length - input_length;
229 assert(remaining <= input_length);
230 memcpy(fp->uncompressed_block, fp->uncompressed_block + input_length, remaining);
232 fp->block_offset = remaining;
233 return compressed_length;
236 // Inflate the block in fp->compressed_block into fp->uncompressed_block
237 static int inflate_block(BGZF* fp, int block_length)
242 zs.next_in = fp->compressed_block + 18;
243 zs.avail_in = block_length - 16;
244 zs.next_out = fp->uncompressed_block;
245 zs.avail_out = BGZF_BLOCK_SIZE;
247 if (inflateInit2(&zs, -15) != Z_OK) {
248 fp->errcode |= BGZF_ERR_ZLIB;
251 if (inflate(&zs, Z_FINISH) != Z_STREAM_END) {
253 fp->errcode |= BGZF_ERR_ZLIB;
256 if (inflateEnd(&zs) != Z_OK) {
257 fp->errcode |= BGZF_ERR_ZLIB;
263 static int check_header(const uint8_t *header)
265 return (header[0] == 31 && header[1] == 139 && header[2] == 8 && (header[3] & 4) != 0
266 && unpackInt16((uint8_t*)&header[10]) == 6
267 && header[12] == 'B' && header[13] == 'C'
268 && unpackInt16((uint8_t*)&header[14]) == 2);
272 static void free_cache(BGZF *fp)
275 khash_t(cache) *h = (khash_t(cache)*)fp->cache;
276 if (fp->open_mode != 'r') return;
277 for (k = kh_begin(h); k < kh_end(h); ++k)
278 if (kh_exist(h, k)) free(kh_val(h, k).block);
279 kh_destroy(cache, h);
282 static int load_block_from_cache(BGZF *fp, int64_t block_address)
286 khash_t(cache) *h = (khash_t(cache)*)fp->cache;
287 k = kh_get(cache, h, block_address);
288 if (k == kh_end(h)) return 0;
290 if (fp->block_length != 0) fp->block_offset = 0;
291 fp->block_address = block_address;
292 fp->block_length = p->size;
293 memcpy(fp->uncompressed_block, p->block, BGZF_BLOCK_SIZE);
294 _bgzf_seek((_bgzf_file_t)fp->fp, p->end_offset, SEEK_SET);
298 static void cache_block(BGZF *fp, int size)
303 khash_t(cache) *h = (khash_t(cache)*)fp->cache;
304 if (BGZF_BLOCK_SIZE >= fp->cache_size) return;
305 if ((kh_size(h) + 1) * BGZF_BLOCK_SIZE > fp->cache_size) {
306 /* A better way would be to remove the oldest block in the
307 * cache, but here we remove a random one for simplicity. This
308 * should not have a big impact on performance. */
309 for (k = kh_begin(h); k < kh_end(h); ++k)
310 if (kh_exist(h, k)) break;
312 free(kh_val(h, k).block);
316 k = kh_put(cache, h, fp->block_address, &ret);
317 if (ret == 0) return; // if this happens, a bug!
319 p->size = fp->block_length;
320 p->end_offset = fp->block_address + size;
321 p->block = malloc(BGZF_BLOCK_SIZE);
322 memcpy(kh_val(h, k).block, fp->uncompressed_block, BGZF_BLOCK_SIZE);
325 static void free_cache(BGZF *fp) {}
326 static int load_block_from_cache(BGZF *fp, int64_t block_address) {return 0;}
327 static void cache_block(BGZF *fp, int size) {}
330 int bgzf_read_block(BGZF *fp)
332 uint8_t header[BLOCK_HEADER_LENGTH], *compressed_block;
333 int count, size = 0, block_length, remaining;
334 int64_t block_address;
335 block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
336 if (load_block_from_cache(fp, block_address)) return 0;
337 count = _bgzf_read(fp->fp, header, sizeof(header));
338 if (count == 0) { // no data read
339 fp->block_length = 0;
342 if (count != sizeof(header) || !check_header(header)) {
343 fp->errcode |= BGZF_ERR_HEADER;
347 block_length = unpackInt16((uint8_t*)&header[16]) + 1; // +1 because when writing this number, we used "-1"
348 compressed_block = (uint8_t*)fp->compressed_block;
349 memcpy(compressed_block, header, BLOCK_HEADER_LENGTH);
350 remaining = block_length - BLOCK_HEADER_LENGTH;
351 count = _bgzf_read(fp->fp, &compressed_block[BLOCK_HEADER_LENGTH], remaining);
352 if (count != remaining) {
353 fp->errcode |= BGZF_ERR_IO;
357 if ((count = inflate_block(fp, block_length)) < 0) return -1;
358 if (fp->block_length != 0) fp->block_offset = 0; // Do not reset offset if this read follows a seek.
359 fp->block_address = block_address;
360 fp->block_length = count;
361 cache_block(fp, size);
365 ssize_t bgzf_read(BGZF *fp, void *data, ssize_t length)
367 ssize_t bytes_read = 0;
368 uint8_t *output = data;
369 if (length <= 0) return 0;
370 assert(fp->open_mode == 'r');
371 while (bytes_read < length) {
372 int copy_length, available = fp->block_length - fp->block_offset;
374 if (available <= 0) {
375 if (bgzf_read_block(fp) != 0) return -1;
376 available = fp->block_length - fp->block_offset;
377 if (available <= 0) break;
379 copy_length = length - bytes_read < available? length - bytes_read : available;
380 buffer = fp->uncompressed_block;
381 memcpy(output, buffer + fp->block_offset, copy_length);
382 fp->block_offset += copy_length;
383 output += copy_length;
384 bytes_read += copy_length;
386 if (fp->block_offset == fp->block_length) {
387 fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
388 fp->block_offset = fp->block_length = 0;
393 int bgzf_flush(BGZF *fp)
395 assert(fp->open_mode == 'w');
396 while (fp->block_offset > 0) {
398 block_length = deflate_block(fp, fp->block_offset);
399 if (block_length < 0) return -1;
400 if (fwrite(fp->compressed_block, 1, block_length, fp->fp) != block_length) {
401 fp->errcode |= BGZF_ERR_IO; // possibly truncated file
404 fp->block_address += block_length;
409 int bgzf_flush_try(BGZF *fp, ssize_t size)
411 if (fp->block_offset + size > BGZF_BLOCK_SIZE)
412 return bgzf_flush(fp);
416 ssize_t bgzf_write(BGZF *fp, const void *data, ssize_t length)
418 const uint8_t *input = data;
419 int block_length = BGZF_BLOCK_SIZE, bytes_written;
420 assert(fp->open_mode == 'w');
423 while (bytes_written < length) {
424 uint8_t* buffer = fp->uncompressed_block;
425 int copy_length = block_length - fp->block_offset < length - bytes_written? block_length - fp->block_offset : length - bytes_written;
426 memcpy(buffer + fp->block_offset, input, copy_length);
427 fp->block_offset += copy_length;
428 input += copy_length;
429 bytes_written += copy_length;
430 if (fp->block_offset == block_length && bgzf_flush(fp)) break;
432 return bytes_written;
435 int bgzf_close(BGZF* fp)
437 int ret, count, block_length;
438 if (fp == 0) return -1;
439 if (fp->open_mode == 'w') {
440 if (bgzf_flush(fp) != 0) return -1;
441 block_length = deflate_block(fp, 0); // write an empty block
442 count = fwrite(fp->compressed_block, 1, block_length, fp->fp);
443 if (fflush(fp->fp) != 0) {
444 fp->errcode |= BGZF_ERR_IO;
448 ret = fp->open_mode == 'w'? fclose(fp->fp) : _bgzf_close(fp->fp);
449 if (ret != 0) return -1;
450 free(fp->uncompressed_block);
451 free(fp->compressed_block);
457 void bgzf_set_cache_size(BGZF *fp, int cache_size)
459 if (fp) fp->cache_size = cache_size;
462 int bgzf_check_EOF(BGZF *fp)
464 static uint8_t magic[28] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0";
467 offset = _bgzf_tell((_bgzf_file_t)fp->fp);
468 if (_bgzf_seek(fp->fp, -28, SEEK_END) < 0) return 0;
469 _bgzf_read(fp->fp, buf, 28);
470 _bgzf_seek(fp->fp, offset, SEEK_SET);
471 return (memcmp(magic, buf, 28) == 0)? 1 : 0;
474 int64_t bgzf_seek(BGZF* fp, int64_t pos, int where)
477 int64_t block_address;
479 if (fp->open_mode != 'r' || where != SEEK_SET) {
480 fp->errcode |= BGZF_ERR_MISUSE;
483 block_offset = pos & 0xFFFF;
484 block_address = pos >> 16;
485 if (_bgzf_seek(fp->fp, block_address, SEEK_SET) < 0) {
486 fp->errcode |= BGZF_ERR_IO;
489 fp->block_length = 0; // indicates current block has not been loaded
490 fp->block_address = block_address;
491 fp->block_offset = block_offset;
495 int bgzf_is_bgzf(const char *fn)
500 if ((fp = _bgzf_open(fn, "r")) == 0) return 0;
501 n = _bgzf_read(fp, buf, 16);
503 if (n != 16) return 0;
504 return memcmp(g_magic, buf, 16) == 0? 1 : 0;
507 int bgzf_getc(BGZF *fp)
510 if (fp->block_offset >= fp->block_length) {
511 if (bgzf_read_block(fp) != 0) return -2; /* error */
512 if (fp->block_length == 0) return -1; /* end-of-file */
514 c = ((unsigned char*)fp->uncompressed_block)[fp->block_offset++];
515 if (fp->block_offset == fp->block_length) {
516 fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
517 fp->block_offset = 0;
518 fp->block_length = 0;
524 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
527 int bgzf_getline(BGZF *fp, int delim, kstring_t *str)
530 unsigned char *buf = (unsigned char*)fp->uncompressed_block;
533 if (fp->block_offset >= fp->block_length) {
534 if (bgzf_read_block(fp) != 0) { state = -2; break; }
535 if (fp->block_length == 0) { state = -1; break; }
537 for (l = fp->block_offset; l < fp->block_length && buf[l] != delim; ++l);
538 if (l < fp->block_length) state = 1;
539 l -= fp->block_offset;
540 if (str->l + l + 1 >= str->m) {
541 str->m = str->l + l + 2;
543 str->s = (char*)realloc(str->s, str->m);
545 memcpy(str->s + str->l, buf + fp->block_offset, l);
547 fp->block_offset += l + 1;
548 if (fp->block_offset >= fp->block_length) {
549 fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
550 fp->block_offset = 0;
551 fp->block_length = 0;
553 } while (state == 0);
554 if (str->l == 0 && state < 0) return state;