3 Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
5 Permission is hereby granted, free of charge, to any person obtaining a copy
6 of this software and associated documentation files (the "Software"), to deal
7 in the Software without restriction, including without limitation the rights
8 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 copies of the Software, and to permit persons to whom the Software is
10 furnished to do so, subject to the following conditions:
12 The above copyright notice and this permission notice shall be included in
13 all copies or substantial portions of the Software.
15 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25 2009-06-29 by lh3: cache recent uncompressed blocks.
26 2009-06-25 by lh3: optionally use my knetfile library to access file on a FTP.
27 2009-06-12 by lh3: support a mode string like "wu" where 'u' for uncompressed output */
34 #include <sys/types.h>
44 KHASH_MAP_INIT_INT64(cache, cache_t)
46 #if defined(_WIN32) || defined(_MSC_VER)
47 #define ftello(fp) ftell(fp)
48 #define fseeko(fp, offset, whence) fseek(fp, offset, whence)
50 extern off_t ftello(FILE *stream);
51 extern int fseeko(FILE *stream, off_t offset, int whence);
54 typedef int8_t bgzf_byte_t;
56 static const int DEFAULT_BLOCK_SIZE = 64 * 1024;
57 static const int MAX_BLOCK_SIZE = 64 * 1024;
59 static const int BLOCK_HEADER_LENGTH = 18;
60 static const int BLOCK_FOOTER_LENGTH = 8;
62 static const int GZIP_ID1 = 31;
63 static const int GZIP_ID2 = 139;
64 static const int CM_DEFLATE = 8;
65 static const int FLG_FEXTRA = 4;
66 static const int OS_UNKNOWN = 255;
67 static const int BGZF_ID1 = 66; // 'B'
68 static const int BGZF_ID2 = 67; // 'C'
69 static const int BGZF_LEN = 2;
70 static const int BGZF_XLEN = 6; // BGZF_LEN+4
72 static const int GZIP_WINDOW_BITS = -15; // no zlib header
73 static const int Z_DEFAULT_MEM_LEVEL = 8;
78 packInt16(uint8_t* buffer, uint16_t value)
81 buffer[1] = value >> 8;
86 unpackInt16(const uint8_t* buffer)
88 return (buffer[0] | (buffer[1] << 8));
93 packInt32(uint8_t* buffer, uint32_t value)
96 buffer[1] = value >> 8;
97 buffer[2] = value >> 16;
98 buffer[3] = value >> 24;
103 bgzf_min(int x, int y)
105 return (x < y) ? x : y;
110 report_error(BGZF* fp, const char* message) {
114 int is_bgzipped(const char *fn)
117 uint8_t buf[10],magic[10]="\037\213\010\4\0\0\0\0\0\377";
120 if ((fp = bgzf_open(fn, "r")) == 0)
122 fprintf(stderr, "[is_bgzipped] failed to open the file: %s\n",fn);
127 n = knet_read(fp->x.fpr, buf, 10);
129 n = fread(buf, 1, 10, fp->file);
136 if ( !memcmp(magic, buf, 10) ) return 1;
140 static BGZF *bgzf_read_init()
143 fp = calloc(1, sizeof(BGZF));
144 fp->uncompressed_block_size = MAX_BLOCK_SIZE;
145 fp->uncompressed_block = malloc(MAX_BLOCK_SIZE);
146 fp->compressed_block_size = MAX_BLOCK_SIZE;
147 fp->compressed_block = malloc(MAX_BLOCK_SIZE);
149 fp->cache = kh_init(cache);
158 knetFile *file = knet_dopen(fd, "r");
160 FILE* file = fdopen(fd, "r");
163 if (file == 0) return 0;
164 fp = bgzf_read_init();
165 fp->file_descriptor = fd;
177 open_write(int fd, bool is_uncompressed)
179 FILE* file = fdopen(fd, "w");
181 if (file == 0) return 0;
182 fp = malloc(sizeof(BGZF));
183 fp->file_descriptor = fd;
185 fp->owned_file = 0; fp->is_uncompressed = is_uncompressed;
191 fp->uncompressed_block_size = DEFAULT_BLOCK_SIZE;
192 fp->uncompressed_block = NULL;
193 fp->compressed_block_size = MAX_BLOCK_SIZE;
194 fp->compressed_block = malloc(MAX_BLOCK_SIZE);
195 fp->block_address = 0;
196 fp->block_offset = 0;
197 fp->block_length = 0;
203 bgzf_open(const char* __restrict path, const char* __restrict mode)
206 if (mode[0] == 'r' || mode[0] == 'R') { /* The reading mode is preferred. */
208 knetFile *file = knet_open(path, mode);
209 if (file == 0) return 0;
210 fp = bgzf_read_init();
211 fp->file_descriptor = -1;
215 int fd, oflag = O_RDONLY;
219 fd = open(path, oflag);
220 if (fd == -1) return 0;
223 } else if (mode[0] == 'w' || mode[0] == 'W') {
224 int fd, oflag = O_WRONLY | O_CREAT | O_TRUNC;
228 fd = open(path, oflag, 0666);
229 if (fd == -1) return 0;
230 fp = open_write(fd, strstr(mode, "u")? 1 : 0);
239 bgzf_fdopen(int fd, const char * __restrict mode)
241 if (fd == -1) return 0;
242 if (mode[0] == 'r' || mode[0] == 'R') {
243 return open_read(fd);
244 } else if (mode[0] == 'w' || mode[0] == 'W') {
245 return open_write(fd, strstr(mode, "u")? 1 : 0);
253 deflate_block(BGZF* fp, int block_length)
255 // Deflate the block in fp->uncompressed_block into fp->compressed_block.
256 // Also adds an extra field that stores the compressed block length.
258 bgzf_byte_t* buffer = fp->compressed_block;
259 int buffer_size = fp->compressed_block_size;
262 buffer[0] = GZIP_ID1;
263 buffer[1] = GZIP_ID2;
264 buffer[2] = CM_DEFLATE;
265 buffer[3] = FLG_FEXTRA;
266 buffer[4] = 0; // mtime
271 buffer[9] = OS_UNKNOWN;
272 buffer[10] = BGZF_XLEN;
274 buffer[12] = BGZF_ID1;
275 buffer[13] = BGZF_ID2;
276 buffer[14] = BGZF_LEN;
278 buffer[16] = 0; // placeholder for block length
281 // loop to retry for blocks that do not compress enough
282 int input_length = block_length;
283 int compressed_length = 0;
285 int compress_level = fp->is_uncompressed? 0 : Z_DEFAULT_COMPRESSION;
289 zs.next_in = fp->uncompressed_block;
290 zs.avail_in = input_length;
291 zs.next_out = (void*)&buffer[BLOCK_HEADER_LENGTH];
292 zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
294 int status = deflateInit2(&zs, compress_level, Z_DEFLATED,
295 GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
296 if (status != Z_OK) {
297 report_error(fp, "deflate init failed");
300 status = deflate(&zs, Z_FINISH);
301 if (status != Z_STREAM_END) {
303 if (status == Z_OK) {
304 // Not enough space in buffer.
305 // Can happen in the rare case the input doesn't compress enough.
306 // Reduce the amount of input until it fits.
307 input_length -= 1024;
308 if (input_length <= 0) {
309 // should never happen
310 report_error(fp, "input reduction failed");
315 report_error(fp, "deflate failed");
318 status = deflateEnd(&zs);
319 if (status != Z_OK) {
320 report_error(fp, "deflate end failed");
323 compressed_length = zs.total_out;
324 compressed_length += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
325 if (compressed_length > MAX_BLOCK_SIZE) {
326 // should never happen
327 report_error(fp, "deflate overflow");
333 packInt16((uint8_t*)&buffer[16], compressed_length-1);
334 uint32_t crc = crc32(0L, NULL, 0L);
335 crc = crc32(crc, fp->uncompressed_block, input_length);
336 packInt32((uint8_t*)&buffer[compressed_length-8], crc);
337 packInt32((uint8_t*)&buffer[compressed_length-4], input_length);
339 int remaining = block_length - input_length;
341 if (remaining > input_length) {
342 // should never happen (check so we can use memcpy)
343 report_error(fp, "remainder too large");
346 memcpy(fp->uncompressed_block,
347 fp->uncompressed_block + input_length,
350 fp->block_offset = remaining;
351 return compressed_length;
356 inflate_block(BGZF* fp, int block_length)
358 // Inflate the block in fp->compressed_block into fp->uncompressed_block
363 zs.next_in = fp->compressed_block + 18;
364 zs.avail_in = block_length - 16;
365 zs.next_out = fp->uncompressed_block;
366 zs.avail_out = fp->uncompressed_block_size;
368 int status = inflateInit2(&zs, GZIP_WINDOW_BITS);
369 if (status != Z_OK) {
370 report_error(fp, "inflate init failed");
373 status = inflate(&zs, Z_FINISH);
374 if (status != Z_STREAM_END) {
376 report_error(fp, "inflate failed");
379 status = inflateEnd(&zs);
380 if (status != Z_OK) {
381 report_error(fp, "inflate failed");
389 check_header(const bgzf_byte_t* header)
391 return (header[0] == GZIP_ID1 &&
392 header[1] == (bgzf_byte_t) GZIP_ID2 &&
393 header[2] == Z_DEFLATED &&
394 (header[3] & FLG_FEXTRA) != 0 &&
395 unpackInt16((uint8_t*)&header[10]) == BGZF_XLEN &&
396 header[12] == BGZF_ID1 &&
397 header[13] == BGZF_ID2 &&
398 unpackInt16((uint8_t*)&header[14]) == BGZF_LEN);
401 static void free_cache(BGZF *fp)
404 khash_t(cache) *h = (khash_t(cache)*)fp->cache;
405 if (fp->open_mode != 'r') return;
406 for (k = kh_begin(h); k < kh_end(h); ++k)
407 if (kh_exist(h, k)) free(kh_val(h, k).block);
408 kh_destroy(cache, h);
411 static int load_block_from_cache(BGZF *fp, int64_t block_address)
415 khash_t(cache) *h = (khash_t(cache)*)fp->cache;
416 k = kh_get(cache, h, block_address);
417 if (k == kh_end(h)) return 0;
419 if (fp->block_length != 0) fp->block_offset = 0;
420 fp->block_address = block_address;
421 fp->block_length = p->size;
422 memcpy(fp->uncompressed_block, p->block, MAX_BLOCK_SIZE);
424 knet_seek(fp->x.fpr, p->end_offset, SEEK_SET);
426 fseeko(fp->file, p->end_offset, SEEK_SET);
431 static void cache_block(BGZF *fp, int size)
436 khash_t(cache) *h = (khash_t(cache)*)fp->cache;
437 if (MAX_BLOCK_SIZE >= fp->cache_size) return;
438 if ((kh_size(h) + 1) * MAX_BLOCK_SIZE > fp->cache_size) {
439 /* A better way would be to remove the oldest block in the
440 * cache, but here we remove a random one for simplicity. This
441 * should not have a big impact on performance. */
442 for (k = kh_begin(h); k < kh_end(h); ++k)
443 if (kh_exist(h, k)) break;
445 free(kh_val(h, k).block);
449 k = kh_put(cache, h, fp->block_address, &ret);
450 if (ret == 0) return; // if this happens, a bug!
452 p->size = fp->block_length;
453 p->end_offset = fp->block_address + size;
454 p->block = malloc(MAX_BLOCK_SIZE);
455 memcpy(kh_val(h, k).block, fp->uncompressed_block, MAX_BLOCK_SIZE);
459 bgzf_read_block(BGZF* fp)
461 bgzf_byte_t header[BLOCK_HEADER_LENGTH];
464 int64_t block_address = knet_tell(fp->x.fpr);
465 if (load_block_from_cache(fp, block_address)) return 0;
466 int count = knet_read(fp->x.fpr, header, sizeof(header));
468 int64_t block_address = ftello(fp->file);
469 if (load_block_from_cache(fp, block_address)) return 0;
470 int count = fread(header, 1, sizeof(header), fp->file);
473 fp->block_length = 0;
477 if (count != sizeof(header)) {
478 report_error(fp, "read failed");
481 if (!check_header(header)) {
482 report_error(fp, "invalid block header");
485 int block_length = unpackInt16((uint8_t*)&header[16]) + 1;
486 bgzf_byte_t* compressed_block = (bgzf_byte_t*) fp->compressed_block;
487 memcpy(compressed_block, header, BLOCK_HEADER_LENGTH);
488 int remaining = block_length - BLOCK_HEADER_LENGTH;
490 count = knet_read(fp->x.fpr, &compressed_block[BLOCK_HEADER_LENGTH], remaining);
492 count = fread(&compressed_block[BLOCK_HEADER_LENGTH], 1, remaining, fp->file);
494 if (count != remaining) {
495 report_error(fp, "read failed");
499 count = inflate_block(fp, block_length);
503 if (fp->block_length != 0) {
504 // Do not reset offset if this read follows a seek.
505 fp->block_offset = 0;
507 fp->block_address = block_address;
508 fp->block_length = count;
509 cache_block(fp, size);
514 bgzf_read(BGZF* fp, void* data, int length)
519 if (fp->open_mode != 'r') {
520 report_error(fp, "file not open for reading");
525 bgzf_byte_t* output = data;
526 while (bytes_read < length) {
527 int available = fp->block_length - fp->block_offset;
528 if (available <= 0) {
529 if (bgzf_read_block(fp) != 0) {
532 available = fp->block_length - fp->block_offset;
533 if (available <= 0) {
537 int copy_length = bgzf_min(length-bytes_read, available);
538 bgzf_byte_t* buffer = fp->uncompressed_block;
539 memcpy(output, buffer + fp->block_offset, copy_length);
540 fp->block_offset += copy_length;
541 output += copy_length;
542 bytes_read += copy_length;
544 if (fp->block_offset == fp->block_length) {
546 fp->block_address = knet_tell(fp->x.fpr);
548 fp->block_address = ftello(fp->file);
550 fp->block_offset = 0;
551 fp->block_length = 0;
558 flush_block(BGZF* fp)
560 while (fp->block_offset > 0) {
561 int block_length = deflate_block(fp, fp->block_offset);
562 if (block_length < 0) {
566 int count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw);
568 int count = fwrite(fp->compressed_block, 1, block_length, fp->file);
570 if (count != block_length) {
571 report_error(fp, "write failed");
574 fp->block_address += block_length;
580 bgzf_write(BGZF* fp, const void* data, int length)
582 if (fp->open_mode != 'w') {
583 report_error(fp, "file not open for writing");
587 if (fp->uncompressed_block == NULL) {
588 fp->uncompressed_block = malloc(fp->uncompressed_block_size);
591 const bgzf_byte_t* input = data;
592 int block_length = fp->uncompressed_block_size;
593 int bytes_written = 0;
594 while (bytes_written < length) {
595 int copy_length = bgzf_min(block_length - fp->block_offset, length - bytes_written);
596 bgzf_byte_t* buffer = fp->uncompressed_block;
597 memcpy(buffer + fp->block_offset, input, copy_length);
598 fp->block_offset += copy_length;
599 input += copy_length;
600 bytes_written += copy_length;
601 if (fp->block_offset == block_length) {
602 if (flush_block(fp) != 0) {
607 return bytes_written;
613 if (fp->open_mode == 'w') {
614 if (flush_block(fp) != 0) {
617 { // add an empty block
618 int count, block_length = deflate_block(fp, 0);
620 count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw);
622 count = fwrite(fp->compressed_block, 1, block_length, fp->file);
626 if (fflush(fp->x.fpw) != 0) {
628 if (fflush(fp->file) != 0) {
630 report_error(fp, "flush failed");
634 if (fp->owned_file) {
637 if (fp->open_mode == 'w') ret = fclose(fp->x.fpw);
638 else ret = knet_close(fp->x.fpr);
639 if (ret != 0) return -1;
641 if (fclose(fp->file) != 0) {
646 free(fp->uncompressed_block);
647 free(fp->compressed_block);
653 void bgzf_set_cache_size(BGZF *fp, int cache_size)
655 if (fp) fp->cache_size = cache_size;
658 int bgzf_check_EOF(BGZF *fp)
660 static uint8_t magic[28] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0";
664 offset = knet_tell(fp->x.fpr);
665 if (knet_seek(fp->x.fpr, -28, SEEK_END) != 0) return -1;
666 knet_read(fp->x.fpr, buf, 28);
667 knet_seek(fp->x.fpr, offset, SEEK_SET);
669 offset = ftello(fp->file);
670 if (fseeko(fp->file, -28, SEEK_END) != 0) return -1;
671 fread(buf, 1, 28, fp->file);
672 fseeko(fp->file, offset, SEEK_SET);
674 return (memcmp(magic, buf, 28) == 0)? 1 : 0;
678 bgzf_seek(BGZF* fp, int64_t pos, int where)
680 if (fp->open_mode != 'r') {
681 report_error(fp, "file not open for read");
684 if (where != SEEK_SET) {
685 report_error(fp, "unimplemented seek option");
688 int block_offset = pos & 0xFFFF;
689 int64_t block_address = (pos >> 16) & 0xFFFFFFFFFFFFLL;
691 if (knet_seek(fp->x.fpr, block_address, SEEK_SET) != 0) {
693 if (fseeko(fp->file, block_address, SEEK_SET) != 0) {
695 report_error(fp, "seek failed");
698 fp->block_length = 0; // indicates current block is not loaded
699 fp->block_address = block_address;
700 fp->block_offset = block_offset;