3 Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
4 2011 Attractive Chaos <attractor@live.co.uk>
6 Permission is hereby granted, free of charge, to any person obtaining a copy
7 of this software and associated documentation files (the "Software"), to deal
8 in the Software without restriction, including without limitation the rights
9 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 copies of the Software, and to permit persons to whom the Software is
11 furnished to do so, subject to the following conditions:
13 The above copyright notice and this permission notice shall be included in
14 all copies or substantial portions of the Software.
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
30 #include <sys/types.h>
35 typedef knetFile *_bgzf_file_t;
36 #define _bgzf_open(fn, mode) knet_open(fn, mode)
37 #define _bgzf_dopen(fp, mode) knet_dopen(fp, mode)
38 #define _bgzf_close(fp) knet_close(fp)
39 #define _bgzf_fileno(fp) ((fp)->fd)
40 #define _bgzf_tell(fp) knet_tell(fp)
41 #define _bgzf_seek(fp, offset, whence) knet_seek(fp, offset, whence)
42 #define _bgzf_read(fp, buf, len) knet_read(fp, buf, len)
43 #define _bgzf_write(fp, buf, len) knet_write(fp, buf, len)
44 #else // ~defined(_USE_KNETFILE)
45 #if defined(_WIN32) || defined(_MSC_VER)
46 #define ftello(fp) ftell(fp)
47 #define fseeko(fp, offset, whence) fseek(fp, offset, whence)
48 #else // ~defined(_WIN32)
49 extern off_t ftello(FILE *stream);
50 extern int fseeko(FILE *stream, off_t offset, int whence);
51 #endif // ~defined(_WIN32)
52 typedef FILE *_bgzf_file_t;
53 #define _bgzf_open(fn, mode) fopen(fn, mode)
54 #define _bgzf_dopen(fp, mode) fdopen(fp, mode)
55 #define _bgzf_close(fp) fclose(fp)
56 #define _bgzf_fileno(fp) fileno(fp)
57 #define _bgzf_tell(fp) ftello(fp)
58 #define _bgzf_seek(fp, offset, whence) fseeko(fp, offset, whence)
59 #define _bgzf_read(fp, buf, len) fread(buf, 1, len, fp)
60 #define _bgzf_write(fp, buf, len) fwrite(buf, 1, len, fp)
61 #endif // ~define(_USE_KNETFILE)
63 #define BLOCK_HEADER_LENGTH 18
64 #define BLOCK_FOOTER_LENGTH 8
66 /* BGZF/GZIP header (speciallized from RFC 1952; little endian):
67 +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
68 | 31|139| 8| 4| 0| 0|255| 6| 66| 67| 2|BLK_LEN|
69 +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
71 static const uint8_t g_magic[19] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\0\0";
80 KHASH_MAP_INIT_INT64(cache, cache_t)
83 static inline void packInt16(uint8_t *buffer, uint16_t value)
86 buffer[1] = value >> 8;
89 static inline int unpackInt16(const uint8_t *buffer)
91 return buffer[0] | buffer[1] << 8;
94 static inline void packInt32(uint8_t *buffer, uint32_t value)
97 buffer[1] = value >> 8;
98 buffer[2] = value >> 16;
99 buffer[3] = value >> 24;
102 static BGZF *bgzf_read_init()
105 fp = calloc(1, sizeof(BGZF));
107 fp->uncompressed_block = malloc(BGZF_BLOCK_SIZE);
108 fp->compressed_block = malloc(BGZF_BLOCK_SIZE);
110 fp->cache = kh_init(cache);
115 static BGZF *bgzf_write_init(int compress_level) // compress_level==-1 for the default level
118 fp = calloc(1, sizeof(BGZF));
120 fp->uncompressed_block = malloc(BGZF_BLOCK_SIZE);
121 fp->compressed_block = malloc(BGZF_BLOCK_SIZE);
122 fp->compress_level = compress_level < 0? Z_DEFAULT_COMPRESSION : compress_level; // Z_DEFAULT_COMPRESSION==-1
123 if (fp->compress_level > 9) fp->compress_level = Z_DEFAULT_COMPRESSION;
126 // get the compress level from the mode string
127 static int mode2level(const char *__restrict mode)
129 int i, compress_level = -1;
130 for (i = 0; mode[i]; ++i)
131 if (mode[i] >= '0' && mode[i] <= '9') break;
132 if (mode[i]) compress_level = (int)mode[i] - '0';
133 if (strchr(mode, 'u')) compress_level = 0;
134 return compress_level;
137 BGZF *bgzf_open(const char *path, const char *mode)
140 if (strchr(mode, 'r') || strchr(mode, 'R')) {
142 if ((fpr = _bgzf_open(path, "r")) == 0) return 0;
143 fp = bgzf_read_init();
145 } else if (strchr(mode, 'w') || strchr(mode, 'W')) {
147 if ((fpw = fopen(path, "w")) == 0) return 0;
148 fp = bgzf_write_init(mode2level(mode));
154 BGZF *bgzf_dopen(int fd, const char *mode)
157 if (strchr(mode, 'r') || strchr(mode, 'R')) {
159 if ((fpr = _bgzf_dopen(fd, "r")) == 0) return 0;
160 fp = bgzf_read_init();
162 } else if (strchr(mode, 'w') || strchr(mode, 'W')) {
164 if ((fpw = fdopen(fd, "w")) == 0) return 0;
165 fp = bgzf_write_init(mode2level(mode));
171 // Deflate the block in fp->uncompressed_block into fp->compressed_block. Also adds an extra field that stores the compressed block length.
172 static int deflate_block(BGZF *fp, int block_length)
174 uint8_t *buffer = fp->compressed_block;
175 int buffer_size = BGZF_BLOCK_SIZE;
176 int input_length = block_length;
177 int compressed_length = 0;
181 assert(block_length <= BGZF_BLOCK_SIZE); // guaranteed by the caller
182 memcpy(buffer, g_magic, BLOCK_HEADER_LENGTH); // the last two bytes are a place holder for the length of the block
183 while (1) { // loop to retry for blocks that do not compress enough
188 zs.next_in = fp->uncompressed_block;
189 zs.avail_in = input_length;
190 zs.next_out = (void*)&buffer[BLOCK_HEADER_LENGTH];
191 zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
192 status = deflateInit2(&zs, fp->compress_level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY); // -15 to disable zlib header/footer
193 if (status != Z_OK) {
194 fp->errcode |= BGZF_ERR_ZLIB;
197 status = deflate(&zs, Z_FINISH);
198 if (status != Z_STREAM_END) { // not compressed enough
199 deflateEnd(&zs); // reset the stream
200 if (status == Z_OK) { // reduce the size and recompress
201 input_length -= 1024;
202 assert(input_length > 0); // logically, this should not happen
205 fp->errcode |= BGZF_ERR_ZLIB;
208 if (deflateEnd(&zs) != Z_OK) {
209 fp->errcode |= BGZF_ERR_ZLIB;
212 compressed_length = zs.total_out;
213 compressed_length += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
214 assert(compressed_length <= BGZF_BLOCK_SIZE);
218 assert(compressed_length > 0);
219 packInt16((uint8_t*)&buffer[16], compressed_length - 1); // write the compressed_length; -1 to fit 2 bytes
220 crc = crc32(0L, NULL, 0L);
221 crc = crc32(crc, fp->uncompressed_block, input_length);
222 packInt32((uint8_t*)&buffer[compressed_length-8], crc);
223 packInt32((uint8_t*)&buffer[compressed_length-4], input_length);
225 remaining = block_length - input_length;
227 assert(remaining <= input_length);
228 memcpy(fp->uncompressed_block, fp->uncompressed_block + input_length, remaining);
230 fp->block_offset = remaining;
231 return compressed_length;
234 // Inflate the block in fp->compressed_block into fp->uncompressed_block
235 static int inflate_block(BGZF* fp, int block_length)
240 zs.next_in = fp->compressed_block + 18;
241 zs.avail_in = block_length - 16;
242 zs.next_out = fp->uncompressed_block;
243 zs.avail_out = BGZF_BLOCK_SIZE;
245 if (inflateInit2(&zs, -15) != Z_OK) {
246 fp->errcode |= BGZF_ERR_ZLIB;
249 if (inflate(&zs, Z_FINISH) != Z_STREAM_END) {
251 fp->errcode |= BGZF_ERR_ZLIB;
254 if (inflateEnd(&zs) != Z_OK) {
255 fp->errcode |= BGZF_ERR_ZLIB;
261 static int check_header(const uint8_t *header)
263 return (header[0] == 31 && header[1] == 139 && header[2] == 8 && (header[3] & 4) != 0
264 && unpackInt16((uint8_t*)&header[10]) == 6
265 && header[12] == 'B' && header[13] == 'C'
266 && unpackInt16((uint8_t*)&header[14]) == 2);
270 static void free_cache(BGZF *fp)
273 khash_t(cache) *h = (khash_t(cache)*)fp->cache;
274 if (fp->open_mode != 'r') return;
275 for (k = kh_begin(h); k < kh_end(h); ++k)
276 if (kh_exist(h, k)) free(kh_val(h, k).block);
277 kh_destroy(cache, h);
280 static int load_block_from_cache(BGZF *fp, int64_t block_address)
284 khash_t(cache) *h = (khash_t(cache)*)fp->cache;
285 k = kh_get(cache, h, block_address);
286 if (k == kh_end(h)) return 0;
288 if (fp->block_length != 0) fp->block_offset = 0;
289 fp->block_address = block_address;
290 fp->block_length = p->size;
291 memcpy(fp->uncompressed_block, p->block, BGZF_BLOCK_SIZE);
292 _bgzf_seek((_bgzf_file_t)fp->fp, p->end_offset, SEEK_SET);
296 static void cache_block(BGZF *fp, int size)
301 khash_t(cache) *h = (khash_t(cache)*)fp->cache;
302 if (BGZF_BLOCK_SIZE >= fp->cache_size) return;
303 if ((kh_size(h) + 1) * BGZF_BLOCK_SIZE > fp->cache_size) {
304 /* A better way would be to remove the oldest block in the
305 * cache, but here we remove a random one for simplicity. This
306 * should not have a big impact on performance. */
307 for (k = kh_begin(h); k < kh_end(h); ++k)
308 if (kh_exist(h, k)) break;
310 free(kh_val(h, k).block);
314 k = kh_put(cache, h, fp->block_address, &ret);
315 if (ret == 0) return; // if this happens, a bug!
317 p->size = fp->block_length;
318 p->end_offset = fp->block_address + size;
319 p->block = malloc(BGZF_BLOCK_SIZE);
320 memcpy(kh_val(h, k).block, fp->uncompressed_block, BGZF_BLOCK_SIZE);
323 static void free_cache(BGZF *fp) {}
324 static int load_block_from_cache(BGZF *fp, int64_t block_address) {return 0;}
325 static void cache_block(BGZF *fp, int size) {}
328 int bgzf_read_block(BGZF *fp)
330 uint8_t header[BLOCK_HEADER_LENGTH], *compressed_block;
331 int count, size = 0, block_length, remaining;
332 int64_t block_address;
333 block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
334 if (load_block_from_cache(fp, block_address)) return 0;
335 count = _bgzf_read(fp->fp, header, sizeof(header));
336 if (count == 0) { // no data read
337 fp->block_length = 0;
340 if (count != sizeof(header) || !check_header(header)) {
341 fp->errcode |= BGZF_ERR_HEADER;
345 block_length = unpackInt16((uint8_t*)&header[16]) + 1; // +1 because when writing this number, we used "-1"
346 compressed_block = (uint8_t*)fp->compressed_block;
347 memcpy(compressed_block, header, BLOCK_HEADER_LENGTH);
348 remaining = block_length - BLOCK_HEADER_LENGTH;
349 count = _bgzf_read(fp->fp, &compressed_block[BLOCK_HEADER_LENGTH], remaining);
350 if (count != remaining) {
351 fp->errcode |= BGZF_ERR_IO;
355 if ((count = inflate_block(fp, block_length)) < 0) return -1;
356 if (fp->block_length != 0) fp->block_offset = 0; // Do not reset offset if this read follows a seek.
357 fp->block_address = block_address;
358 fp->block_length = count;
359 cache_block(fp, size);
363 ssize_t bgzf_read(BGZF *fp, void *data, ssize_t length)
365 ssize_t bytes_read = 0;
366 uint8_t *output = data;
367 if (length <= 0) return 0;
368 assert(fp->open_mode == 'r');
369 while (bytes_read < length) {
370 int copy_length, available = fp->block_length - fp->block_offset;
372 if (available <= 0) {
373 if (bgzf_read_block(fp) != 0) return -1;
374 available = fp->block_length - fp->block_offset;
375 if (available <= 0) break;
377 copy_length = length - bytes_read < available? length - bytes_read : available;
378 buffer = fp->uncompressed_block;
379 memcpy(output, buffer + fp->block_offset, copy_length);
380 fp->block_offset += copy_length;
381 output += copy_length;
382 bytes_read += copy_length;
384 if (fp->block_offset == fp->block_length) {
385 fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
386 fp->block_offset = fp->block_length = 0;
391 int bgzf_flush(BGZF *fp)
393 assert(fp->open_mode == 'w');
394 while (fp->block_offset > 0) {
396 block_length = deflate_block(fp, fp->block_offset);
397 if (block_length < 0) return -1;
398 if (fwrite(fp->compressed_block, 1, block_length, fp->fp) != block_length) {
399 fp->errcode |= BGZF_ERR_IO; // possibly truncated file
402 fp->block_address += block_length;
407 int bgzf_flush_try(BGZF *fp, ssize_t size)
409 if (fp->block_offset + size > BGZF_BLOCK_SIZE)
410 return bgzf_flush(fp);
414 ssize_t bgzf_write(BGZF *fp, const void *data, ssize_t length)
416 const uint8_t *input = data;
417 int block_length = BGZF_BLOCK_SIZE, bytes_written;
418 assert(fp->open_mode == 'w');
421 while (bytes_written < length) {
422 uint8_t* buffer = fp->uncompressed_block;
423 int copy_length = block_length - fp->block_offset < length - bytes_written? block_length - fp->block_offset : length - bytes_written;
424 memcpy(buffer + fp->block_offset, input, copy_length);
425 fp->block_offset += copy_length;
426 input += copy_length;
427 bytes_written += copy_length;
428 if (fp->block_offset == block_length && bgzf_flush(fp)) break;
430 return bytes_written;
433 int bgzf_close(BGZF* fp)
435 int ret, count, block_length;
436 if (fp == 0) return -1;
437 if (fp->open_mode == 'w') {
438 if (bgzf_flush(fp) != 0) return -1;
439 block_length = deflate_block(fp, 0); // write an empty block
440 count = fwrite(fp->compressed_block, 1, block_length, fp->fp);
441 if (fflush(fp->fp) != 0) {
442 fp->errcode |= BGZF_ERR_IO;
446 ret = fp->open_mode == 'w'? fclose(fp->fp) : _bgzf_close(fp->fp);
447 if (ret != 0) return -1;
448 free(fp->uncompressed_block);
449 free(fp->compressed_block);
455 void bgzf_set_cache_size(BGZF *fp, int cache_size)
457 if (fp) fp->cache_size = cache_size;
460 int bgzf_check_EOF(BGZF *fp)
462 static uint8_t magic[28] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0";
465 offset = _bgzf_tell((_bgzf_file_t)fp->fp);
466 if (_bgzf_seek(fp->fp, -28, SEEK_END) < 0) return 0;
467 _bgzf_read(fp->fp, buf, 28);
468 _bgzf_seek(fp->fp, offset, SEEK_SET);
469 return (memcmp(magic, buf, 28) == 0)? 1 : 0;
472 int64_t bgzf_seek(BGZF* fp, int64_t pos, int where)
475 int64_t block_address;
477 if (fp->open_mode != 'r' || where != SEEK_SET) {
478 fp->errcode |= BGZF_ERR_MISUSE;
481 block_offset = pos & 0xFFFF;
482 block_address = pos >> 16;
483 if (_bgzf_seek(fp->fp, block_address, SEEK_SET) < 0) {
484 fp->errcode |= BGZF_ERR_IO;
487 fp->block_length = 0; // indicates current block has not been loaded
488 fp->block_address = block_address;
489 fp->block_offset = block_offset;
493 int bgzf_is_bgzf(const char *fn)
498 if ((fp = _bgzf_open(fn, "r")) == 0) return 0;
499 n = _bgzf_read(fp, buf, 16);
501 if (n != 16) return 0;
502 return memcmp(g_magic, buf, 16) == 0? 1 : 0;
505 int bgzf_getc(BGZF *fp)
508 if (fp->block_offset >= fp->block_length) {
509 if (bgzf_read_block(fp) != 0) return -2; /* error */
510 if (fp->block_length == 0) return -1; /* end-of-file */
512 c = ((unsigned char*)fp->uncompressed_block)[fp->block_offset++];
513 if (fp->block_offset == fp->block_length) {
514 fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
515 fp->block_offset = 0;
516 fp->block_length = 0;
522 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
525 int bgzf_getline(BGZF *fp, int delim, kstring_t *str)
528 unsigned char *buf = (unsigned char*)fp->uncompressed_block;
531 if (fp->block_offset >= fp->block_length) {
532 if (bgzf_read_block(fp) != 0) { state = -2; break; }
533 if (fp->block_length == 0) { state = -1; break; }
535 for (l = fp->block_offset; l < fp->block_length && buf[l] != delim; ++l);
536 if (l < fp->block_length) state = 1;
537 l -= fp->block_offset;
538 if (str->l + l + 1 >= str->m) {
539 str->m = str->l + l + 2;
541 str->s = (char*)realloc(str->s, str->m);
543 memcpy(str->s + str->l, buf + fp->block_offset, l);
545 fp->block_offset += l + 1;
546 if (fp->block_offset >= fp->block_length) {
547 fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
548 fp->block_offset = 0;
549 fp->block_length = 0;
551 } while (state == 0);
552 if (str->l == 0 && state < 0) return state;