5 Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
27 2009-06-29 by lh3: cache recent uncompressed blocks.
28 2009-06-25 by lh3: optionally use my knetfile library to access file on a FTP.
29 2009-06-12 by lh3: support a mode string like "wu" where 'u' for uncompressed output */
36 #include <sys/types.h>
46 KHASH_MAP_INIT_INT64(cache, cache_t)
48 #if defined(_WIN32) || defined(_MSC_VER)
49 #define ftello(fp) ftell(fp)
50 #define fseeko(fp, offset, whence) fseek(fp, offset, whence)
52 extern off_t ftello(FILE *stream);
53 extern int fseeko(FILE *stream, off_t offset, int whence);
56 typedef int8_t bgzf_byte_t;
58 static const int DEFAULT_BLOCK_SIZE = 64 * 1024;
59 static const int MAX_BLOCK_SIZE = 64 * 1024;
61 static const int BLOCK_HEADER_LENGTH = 18;
62 static const int BLOCK_FOOTER_LENGTH = 8;
64 static const int GZIP_ID1 = 31;
65 static const int GZIP_ID2 = 139;
66 static const int CM_DEFLATE = 8;
67 static const int FLG_FEXTRA = 4;
68 static const int OS_UNKNOWN = 255;
69 static const int BGZF_ID1 = 66; // 'B'
70 static const int BGZF_ID2 = 67; // 'C'
71 static const int BGZF_LEN = 2;
72 static const int BGZF_XLEN = 6; // BGZF_LEN+4
74 static const int GZIP_WINDOW_BITS = -15; // no zlib header
75 static const int Z_DEFAULT_MEM_LEVEL = 8;
80 packInt16(uint8_t* buffer, uint16_t value)
83 buffer[1] = value >> 8;
88 unpackInt16(const uint8_t* buffer)
90 return (buffer[0] | (buffer[1] << 8));
95 packInt32(uint8_t* buffer, uint32_t value)
98 buffer[1] = value >> 8;
99 buffer[2] = value >> 16;
100 buffer[3] = value >> 24;
105 bgzf_min(int x, int y)
107 return (x < y) ? x : y;
112 report_error(BGZF* fp, const char* message) {
116 int bgzf_check_bgzf(const char *fn)
119 uint8_t buf[10],magic[10]="\037\213\010\4\0\0\0\0\0\377";
122 if ((fp = bgzf_open(fn, "r")) == 0)
124 fprintf(pysamerr, "[bgzf_check_bgzf] failed to open the file: %s\n",fn);
129 n = knet_read(fp->x.fpr, buf, 10);
131 n = fread(buf, 1, 10, fp->file);
138 if ( !memcmp(magic, buf, 10) ) return 1;
142 static BGZF *bgzf_read_init()
145 fp = calloc(1, sizeof(BGZF));
146 fp->uncompressed_block_size = MAX_BLOCK_SIZE;
147 fp->uncompressed_block = malloc(MAX_BLOCK_SIZE);
148 fp->compressed_block_size = MAX_BLOCK_SIZE;
149 fp->compressed_block = malloc(MAX_BLOCK_SIZE);
151 fp->cache = kh_init(cache);
160 knetFile *file = knet_dopen(fd, "r");
162 FILE* file = fdopen(fd, "r");
165 if (file == 0) return 0;
166 fp = bgzf_read_init();
167 fp->file_descriptor = fd;
179 open_write(int fd, int compress_level) // compress_level==-1 for the default level
181 FILE* file = fdopen(fd, "w");
183 if (file == 0) return 0;
184 fp = malloc(sizeof(BGZF));
185 fp->file_descriptor = fd;
188 fp->compress_level = compress_level < 0? Z_DEFAULT_COMPRESSION : compress_level; // Z_DEFAULT_COMPRESSION==-1
189 if (fp->compress_level > 9) fp->compress_level = Z_DEFAULT_COMPRESSION;
195 fp->uncompressed_block_size = DEFAULT_BLOCK_SIZE;
196 fp->uncompressed_block = NULL;
197 fp->compressed_block_size = MAX_BLOCK_SIZE;
198 fp->compressed_block = malloc(MAX_BLOCK_SIZE);
199 fp->block_address = 0;
200 fp->block_offset = 0;
201 fp->block_length = 0;
207 bgzf_open(const char* __restrict path, const char* __restrict mode)
210 if (strchr(mode, 'r') || strchr(mode, 'R')) { /* The reading mode is preferred. */
212 knetFile *file = knet_open(path, mode);
213 if (file == 0) return 0;
214 fp = bgzf_read_init();
215 fp->file_descriptor = -1;
219 int fd, oflag = O_RDONLY;
223 fd = open(path, oflag);
224 if (fd == -1) return 0;
227 } else if (strchr(mode, 'w') || strchr(mode, 'W')) {
228 int fd, compress_level = -1, oflag = O_WRONLY | O_CREAT | O_TRUNC;
232 fd = open(path, oflag, 0666);
233 if (fd == -1) return 0;
234 { // set compress_level
236 for (i = 0; mode[i]; ++i)
237 if (mode[i] >= '0' && mode[i] <= '9') break;
238 if (mode[i]) compress_level = (int)mode[i] - '0';
239 if (strchr(mode, 'u')) compress_level = 0;
241 fp = open_write(fd, compress_level);
243 if (fp != NULL) fp->owned_file = 1;
248 bgzf_fdopen(int fd, const char * __restrict mode)
250 if (fd == -1) return 0;
251 if (mode[0] == 'r' || mode[0] == 'R') {
252 return open_read(fd);
253 } else if (mode[0] == 'w' || mode[0] == 'W') {
254 int i, compress_level = -1;
255 for (i = 0; mode[i]; ++i)
256 if (mode[i] >= '0' && mode[i] <= '9') break;
257 if (mode[i]) compress_level = (int)mode[i] - '0';
258 if (strchr(mode, 'u')) compress_level = 0;
259 return open_write(fd, compress_level);
267 deflate_block(BGZF* fp, int block_length)
269 // Deflate the block in fp->uncompressed_block into fp->compressed_block.
270 // Also adds an extra field that stores the compressed block length.
272 bgzf_byte_t* buffer = fp->compressed_block;
273 int buffer_size = fp->compressed_block_size;
276 buffer[0] = GZIP_ID1;
277 buffer[1] = GZIP_ID2;
278 buffer[2] = CM_DEFLATE;
279 buffer[3] = FLG_FEXTRA;
280 buffer[4] = 0; // mtime
285 buffer[9] = OS_UNKNOWN;
286 buffer[10] = BGZF_XLEN;
288 buffer[12] = BGZF_ID1;
289 buffer[13] = BGZF_ID2;
290 buffer[14] = BGZF_LEN;
292 buffer[16] = 0; // placeholder for block length
295 // loop to retry for blocks that do not compress enough
296 int input_length = block_length;
297 int compressed_length = 0;
302 zs.next_in = fp->uncompressed_block;
303 zs.avail_in = input_length;
304 zs.next_out = (void*)&buffer[BLOCK_HEADER_LENGTH];
305 zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
307 int status = deflateInit2(&zs, fp->compress_level, Z_DEFLATED,
308 GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
309 if (status != Z_OK) {
310 report_error(fp, "deflate init failed");
313 status = deflate(&zs, Z_FINISH);
314 if (status != Z_STREAM_END) {
316 if (status == Z_OK) {
317 // Not enough space in buffer.
318 // Can happen in the rare case the input doesn't compress enough.
319 // Reduce the amount of input until it fits.
320 input_length -= 1024;
321 if (input_length <= 0) {
322 // should never happen
323 report_error(fp, "input reduction failed");
328 report_error(fp, "deflate failed");
331 status = deflateEnd(&zs);
332 if (status != Z_OK) {
333 report_error(fp, "deflate end failed");
336 compressed_length = zs.total_out;
337 compressed_length += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
338 if (compressed_length > MAX_BLOCK_SIZE) {
339 // should never happen
340 report_error(fp, "deflate overflow");
346 packInt16((uint8_t*)&buffer[16], compressed_length-1);
347 uint32_t crc = crc32(0L, NULL, 0L);
348 crc = crc32(crc, fp->uncompressed_block, input_length);
349 packInt32((uint8_t*)&buffer[compressed_length-8], crc);
350 packInt32((uint8_t*)&buffer[compressed_length-4], input_length);
352 int remaining = block_length - input_length;
354 if (remaining > input_length) {
355 // should never happen (check so we can use memcpy)
356 report_error(fp, "remainder too large");
359 memcpy(fp->uncompressed_block,
360 fp->uncompressed_block + input_length,
363 fp->block_offset = remaining;
364 return compressed_length;
369 inflate_block(BGZF* fp, int block_length)
371 // Inflate the block in fp->compressed_block into fp->uncompressed_block
377 zs.next_in = fp->compressed_block + 18;
378 zs.avail_in = block_length - 16;
379 zs.next_out = fp->uncompressed_block;
380 zs.avail_out = fp->uncompressed_block_size;
382 status = inflateInit2(&zs, GZIP_WINDOW_BITS);
383 if (status != Z_OK) {
384 report_error(fp, "inflate init failed");
387 status = inflate(&zs, Z_FINISH);
388 if (status != Z_STREAM_END) {
390 report_error(fp, "inflate failed");
393 status = inflateEnd(&zs);
394 if (status != Z_OK) {
395 report_error(fp, "inflate failed");
403 check_header(const bgzf_byte_t* header)
405 return (header[0] == GZIP_ID1 &&
406 header[1] == (bgzf_byte_t) GZIP_ID2 &&
407 header[2] == Z_DEFLATED &&
408 (header[3] & FLG_FEXTRA) != 0 &&
409 unpackInt16((uint8_t*)&header[10]) == BGZF_XLEN &&
410 header[12] == BGZF_ID1 &&
411 header[13] == BGZF_ID2 &&
412 unpackInt16((uint8_t*)&header[14]) == BGZF_LEN);
415 static void free_cache(BGZF *fp)
418 khash_t(cache) *h = (khash_t(cache)*)fp->cache;
419 if (fp->open_mode != 'r') return;
420 for (k = kh_begin(h); k < kh_end(h); ++k)
421 if (kh_exist(h, k)) free(kh_val(h, k).block);
422 kh_destroy(cache, h);
425 static int load_block_from_cache(BGZF *fp, int64_t block_address)
429 khash_t(cache) *h = (khash_t(cache)*)fp->cache;
430 k = kh_get(cache, h, block_address);
431 if (k == kh_end(h)) return 0;
433 if (fp->block_length != 0) fp->block_offset = 0;
434 fp->block_address = block_address;
435 fp->block_length = p->size;
436 memcpy(fp->uncompressed_block, p->block, MAX_BLOCK_SIZE);
438 knet_seek(fp->x.fpr, p->end_offset, SEEK_SET);
440 fseeko(fp->file, p->end_offset, SEEK_SET);
445 static void cache_block(BGZF *fp, int size)
450 khash_t(cache) *h = (khash_t(cache)*)fp->cache;
451 if (MAX_BLOCK_SIZE >= fp->cache_size) return;
452 if ((kh_size(h) + 1) * MAX_BLOCK_SIZE > fp->cache_size) {
453 /* A better way would be to remove the oldest block in the
454 * cache, but here we remove a random one for simplicity. This
455 * should not have a big impact on performance. */
456 for (k = kh_begin(h); k < kh_end(h); ++k)
457 if (kh_exist(h, k)) break;
459 free(kh_val(h, k).block);
463 k = kh_put(cache, h, fp->block_address, &ret);
464 if (ret == 0) return; // if this happens, a bug!
466 p->size = fp->block_length;
467 p->end_offset = fp->block_address + size;
468 p->block = malloc(MAX_BLOCK_SIZE);
469 memcpy(kh_val(h, k).block, fp->uncompressed_block, MAX_BLOCK_SIZE);
473 bgzf_read_block(BGZF* fp)
475 bgzf_byte_t header[BLOCK_HEADER_LENGTH];
476 int count, size = 0, block_length, remaining;
478 int64_t block_address = knet_tell(fp->x.fpr);
479 if (load_block_from_cache(fp, block_address)) return 0;
480 count = knet_read(fp->x.fpr, header, sizeof(header));
482 int64_t block_address = ftello(fp->file);
483 if (load_block_from_cache(fp, block_address)) return 0;
484 count = fread(header, 1, sizeof(header), fp->file);
487 fp->block_length = 0;
491 if (count != sizeof(header)) {
492 report_error(fp, "read failed");
495 if (!check_header(header)) {
496 report_error(fp, "invalid block header");
499 block_length = unpackInt16((uint8_t*)&header[16]) + 1;
500 bgzf_byte_t* compressed_block = (bgzf_byte_t*) fp->compressed_block;
501 memcpy(compressed_block, header, BLOCK_HEADER_LENGTH);
502 remaining = block_length - BLOCK_HEADER_LENGTH;
504 count = knet_read(fp->x.fpr, &compressed_block[BLOCK_HEADER_LENGTH], remaining);
506 count = fread(&compressed_block[BLOCK_HEADER_LENGTH], 1, remaining, fp->file);
508 if (count != remaining) {
509 report_error(fp, "read failed");
513 count = inflate_block(fp, block_length);
514 if (count < 0) return -1;
515 if (fp->block_length != 0) {
516 // Do not reset offset if this read follows a seek.
517 fp->block_offset = 0;
519 fp->block_address = block_address;
520 fp->block_length = count;
521 cache_block(fp, size);
526 bgzf_read(BGZF* fp, void* data, int length)
531 if (fp->open_mode != 'r') {
532 report_error(fp, "file not open for reading");
537 bgzf_byte_t* output = data;
538 while (bytes_read < length) {
539 int copy_length, available = fp->block_length - fp->block_offset;
541 if (available <= 0) {
542 if (bgzf_read_block(fp) != 0) {
545 available = fp->block_length - fp->block_offset;
546 if (available <= 0) {
550 copy_length = bgzf_min(length-bytes_read, available);
551 buffer = fp->uncompressed_block;
552 memcpy(output, buffer + fp->block_offset, copy_length);
553 fp->block_offset += copy_length;
554 output += copy_length;
555 bytes_read += copy_length;
557 if (fp->block_offset == fp->block_length) {
559 fp->block_address = knet_tell(fp->x.fpr);
561 fp->block_address = ftello(fp->file);
563 fp->block_offset = 0;
564 fp->block_length = 0;
569 int bgzf_flush(BGZF* fp)
571 while (fp->block_offset > 0) {
572 int count, block_length;
573 block_length = deflate_block(fp, fp->block_offset);
574 if (block_length < 0) return -1;
576 count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw);
578 count = fwrite(fp->compressed_block, 1, block_length, fp->file);
580 if (count != block_length) {
581 report_error(fp, "write failed");
584 fp->block_address += block_length;
589 int bgzf_flush_try(BGZF *fp, int size)
591 if (fp->block_offset + size > fp->uncompressed_block_size)
592 return bgzf_flush(fp);
596 int bgzf_write(BGZF* fp, const void* data, int length)
598 const bgzf_byte_t *input = data;
599 int block_length, bytes_written;
600 if (fp->open_mode != 'w') {
601 report_error(fp, "file not open for writing");
605 if (fp->uncompressed_block == NULL)
606 fp->uncompressed_block = malloc(fp->uncompressed_block_size);
609 block_length = fp->uncompressed_block_size;
611 while (bytes_written < length) {
612 int copy_length = bgzf_min(block_length - fp->block_offset, length - bytes_written);
613 bgzf_byte_t* buffer = fp->uncompressed_block;
614 memcpy(buffer + fp->block_offset, input, copy_length);
615 fp->block_offset += copy_length;
616 input += copy_length;
617 bytes_written += copy_length;
618 if (fp->block_offset == block_length) {
619 if (bgzf_flush(fp) != 0) {
624 return bytes_written;
627 int bgzf_close(BGZF* fp)
629 if (fp->open_mode == 'w') {
630 if (bgzf_flush(fp) != 0) return -1;
631 { // add an empty block
632 int count, block_length = deflate_block(fp, 0);
634 count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw);
636 count = fwrite(fp->compressed_block, 1, block_length, fp->file);
640 if (fflush(fp->x.fpw) != 0) {
642 if (fflush(fp->file) != 0) {
644 report_error(fp, "flush failed");
648 if (fp->owned_file) {
651 if (fp->open_mode == 'w') ret = fclose(fp->x.fpw);
652 else ret = knet_close(fp->x.fpr);
653 if (ret != 0) return -1;
655 if (fclose(fp->file) != 0) return -1;
658 free(fp->uncompressed_block);
659 free(fp->compressed_block);
665 void bgzf_set_cache_size(BGZF *fp, int cache_size)
667 if (fp) fp->cache_size = cache_size;
670 int bgzf_check_EOF(BGZF *fp)
672 static uint8_t magic[28] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0";
676 offset = knet_tell(fp->x.fpr);
677 if (knet_seek(fp->x.fpr, -28, SEEK_END) != 0) return -1;
678 knet_read(fp->x.fpr, buf, 28);
679 knet_seek(fp->x.fpr, offset, SEEK_SET);
681 offset = ftello(fp->file);
682 if (fseeko(fp->file, -28, SEEK_END) != 0) return -1;
683 fread(buf, 1, 28, fp->file);
684 fseeko(fp->file, offset, SEEK_SET);
686 return (memcmp(magic, buf, 28) == 0)? 1 : 0;
689 int64_t bgzf_seek(BGZF* fp, int64_t pos, int where)
692 int64_t block_address;
694 if (fp->open_mode != 'r') {
695 report_error(fp, "file not open for read");
698 if (where != SEEK_SET) {
699 report_error(fp, "unimplemented seek option");
702 block_offset = pos & 0xFFFF;
703 block_address = (pos >> 16) & 0xFFFFFFFFFFFFLL;
705 if (knet_seek(fp->x.fpr, block_address, SEEK_SET) != 0) {
707 if (fseeko(fp->file, block_address, SEEK_SET) != 0) {
709 report_error(fp, "seek failed");
712 fp->block_length = 0; // indicates current block is not loaded
713 fp->block_address = block_address;
714 fp->block_offset = block_offset;