3 Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
5 Permission is hereby granted, free of charge, to any person obtaining a copy
6 of this software and associated documentation files (the "Software"), to deal
7 in the Software without restriction, including without limitation the rights
8 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 copies of the Software, and to permit persons to whom the Software is
10 furnished to do so, subject to the following conditions:
12 The above copyright notice and this permission notice shall be included in
13 all copies or substantial portions of the Software.
15 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25 2009-06-29 by lh3: cache recent uncompressed blocks.
26 2009-06-25 by lh3: optionally use my knetfile library to access file on a FTP.
27 2009-06-12 by lh3: support a mode string like "wu" where 'u' for uncompressed output */
34 #include <sys/types.h>
44 KHASH_MAP_INIT_INT64(cache, cache_t)
46 #if defined(_WIN32) || defined(_MSC_VER)
47 #define ftello(fp) ftell(fp)
48 #define fseeko(fp, offset, whence) fseek(fp, offset, whence)
50 extern off_t ftello(FILE *stream);
51 extern int fseeko(FILE *stream, off_t offset, int whence);
54 typedef int8_t bgzf_byte_t;
56 static const int DEFAULT_BLOCK_SIZE = 64 * 1024;
57 static const int MAX_BLOCK_SIZE = 64 * 1024;
59 static const int BLOCK_HEADER_LENGTH = 18;
60 static const int BLOCK_FOOTER_LENGTH = 8;
62 static const int GZIP_ID1 = 31;
63 static const int GZIP_ID2 = 139;
64 static const int CM_DEFLATE = 8;
65 static const int FLG_FEXTRA = 4;
66 static const int OS_UNKNOWN = 255;
67 static const int BGZF_ID1 = 66; // 'B'
68 static const int BGZF_ID2 = 67; // 'C'
69 static const int BGZF_LEN = 2;
70 static const int BGZF_XLEN = 6; // BGZF_LEN+4
72 static const int GZIP_WINDOW_BITS = -15; // no zlib header
73 static const int Z_DEFAULT_MEM_LEVEL = 8;
78 packInt16(uint8_t* buffer, uint16_t value)
81 buffer[1] = value >> 8;
86 unpackInt16(const uint8_t* buffer)
88 return (buffer[0] | (buffer[1] << 8));
93 packInt32(uint8_t* buffer, uint32_t value)
96 buffer[1] = value >> 8;
97 buffer[2] = value >> 16;
98 buffer[3] = value >> 24;
103 bgzf_min(int x, int y)
105 return (x < y) ? x : y;
110 report_error(BGZF* fp, const char* message) {
114 static BGZF *bgzf_read_init()
117 fp = calloc(1, sizeof(BGZF));
118 fp->uncompressed_block_size = MAX_BLOCK_SIZE;
119 fp->uncompressed_block = malloc(MAX_BLOCK_SIZE);
120 fp->compressed_block_size = MAX_BLOCK_SIZE;
121 fp->compressed_block = malloc(MAX_BLOCK_SIZE);
123 fp->cache = kh_init(cache);
132 knetFile *file = knet_dopen(fd, "r");
134 FILE* file = fdopen(fd, "r");
137 if (file == 0) return 0;
138 fp = bgzf_read_init();
139 fp->file_descriptor = fd;
151 open_write(int fd, bool is_uncompressed)
153 FILE* file = fdopen(fd, "w");
155 if (file == 0) return 0;
156 fp = malloc(sizeof(BGZF));
157 fp->file_descriptor = fd;
159 fp->owned_file = 0; fp->is_uncompressed = is_uncompressed;
165 fp->uncompressed_block_size = DEFAULT_BLOCK_SIZE;
166 fp->uncompressed_block = NULL;
167 fp->compressed_block_size = MAX_BLOCK_SIZE;
168 fp->compressed_block = malloc(MAX_BLOCK_SIZE);
169 fp->block_address = 0;
170 fp->block_offset = 0;
171 fp->block_length = 0;
177 bgzf_open(const char* __restrict path, const char* __restrict mode)
180 if (mode[0] == 'r' || mode[0] == 'R') { /* The reading mode is preferred. */
182 knetFile *file = knet_open(path, mode);
183 if (file == 0) return 0;
184 fp = bgzf_read_init();
185 fp->file_descriptor = -1;
189 int fd, oflag = O_RDONLY;
193 fd = open(path, oflag);
194 if (fd == -1) return 0;
197 } else if (mode[0] == 'w' || mode[0] == 'W') {
198 int fd, oflag = O_WRONLY | O_CREAT | O_TRUNC;
202 fd = open(path, oflag, 0666);
203 if (fd == -1) return 0;
204 fp = open_write(fd, strstr(mode, "u")? 1 : 0);
213 bgzf_fdopen(int fd, const char * __restrict mode)
215 if (fd == -1) return 0;
216 if (mode[0] == 'r' || mode[0] == 'R') {
217 return open_read(fd);
218 } else if (mode[0] == 'w' || mode[0] == 'W') {
219 return open_write(fd, strstr(mode, "u")? 1 : 0);
227 deflate_block(BGZF* fp, int block_length)
229 // Deflate the block in fp->uncompressed_block into fp->compressed_block.
230 // Also adds an extra field that stores the compressed block length.
232 bgzf_byte_t* buffer = fp->compressed_block;
233 int buffer_size = fp->compressed_block_size;
236 buffer[0] = GZIP_ID1;
237 buffer[1] = GZIP_ID2;
238 buffer[2] = CM_DEFLATE;
239 buffer[3] = FLG_FEXTRA;
240 buffer[4] = 0; // mtime
245 buffer[9] = OS_UNKNOWN;
246 buffer[10] = BGZF_XLEN;
248 buffer[12] = BGZF_ID1;
249 buffer[13] = BGZF_ID2;
250 buffer[14] = BGZF_LEN;
252 buffer[16] = 0; // placeholder for block length
255 // loop to retry for blocks that do not compress enough
256 int input_length = block_length;
257 int compressed_length = 0;
259 int compress_level = fp->is_uncompressed? 0 : Z_DEFAULT_COMPRESSION;
263 zs.next_in = fp->uncompressed_block;
264 zs.avail_in = input_length;
265 zs.next_out = (void*)&buffer[BLOCK_HEADER_LENGTH];
266 zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
268 int status = deflateInit2(&zs, compress_level, Z_DEFLATED,
269 GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
270 if (status != Z_OK) {
271 report_error(fp, "deflate init failed");
274 status = deflate(&zs, Z_FINISH);
275 if (status != Z_STREAM_END) {
277 if (status == Z_OK) {
278 // Not enough space in buffer.
279 // Can happen in the rare case the input doesn't compress enough.
280 // Reduce the amount of input until it fits.
281 input_length -= 1024;
282 if (input_length <= 0) {
283 // should never happen
284 report_error(fp, "input reduction failed");
289 report_error(fp, "deflate failed");
292 status = deflateEnd(&zs);
293 if (status != Z_OK) {
294 report_error(fp, "deflate end failed");
297 compressed_length = zs.total_out;
298 compressed_length += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
299 if (compressed_length > MAX_BLOCK_SIZE) {
300 // should never happen
301 report_error(fp, "deflate overflow");
307 packInt16((uint8_t*)&buffer[16], compressed_length-1);
308 uint32_t crc = crc32(0L, NULL, 0L);
309 crc = crc32(crc, fp->uncompressed_block, input_length);
310 packInt32((uint8_t*)&buffer[compressed_length-8], crc);
311 packInt32((uint8_t*)&buffer[compressed_length-4], input_length);
313 int remaining = block_length - input_length;
315 if (remaining > input_length) {
316 // should never happen (check so we can use memcpy)
317 report_error(fp, "remainder too large");
320 memcpy(fp->uncompressed_block,
321 fp->uncompressed_block + input_length,
324 fp->block_offset = remaining;
325 return compressed_length;
330 inflate_block(BGZF* fp, int block_length)
332 // Inflate the block in fp->compressed_block into fp->uncompressed_block
337 zs.next_in = fp->compressed_block + 18;
338 zs.avail_in = block_length - 16;
339 zs.next_out = fp->uncompressed_block;
340 zs.avail_out = fp->uncompressed_block_size;
342 int status = inflateInit2(&zs, GZIP_WINDOW_BITS);
343 if (status != Z_OK) {
344 report_error(fp, "inflate init failed");
347 status = inflate(&zs, Z_FINISH);
348 if (status != Z_STREAM_END) {
350 report_error(fp, "inflate failed");
353 status = inflateEnd(&zs);
354 if (status != Z_OK) {
355 report_error(fp, "inflate failed");
363 check_header(const bgzf_byte_t* header)
365 return (header[0] == GZIP_ID1 &&
366 header[1] == (bgzf_byte_t) GZIP_ID2 &&
367 header[2] == Z_DEFLATED &&
368 (header[3] & FLG_FEXTRA) != 0 &&
369 unpackInt16((uint8_t*)&header[10]) == BGZF_XLEN &&
370 header[12] == BGZF_ID1 &&
371 header[13] == BGZF_ID2 &&
372 unpackInt16((uint8_t*)&header[14]) == BGZF_LEN);
375 static void free_cache(BGZF *fp)
378 khash_t(cache) *h = (khash_t(cache)*)fp->cache;
379 if (fp->open_mode != 'r') return;
380 for (k = kh_begin(h); k < kh_end(h); ++k)
381 if (kh_exist(h, k)) free(kh_val(h, k).block);
382 kh_destroy(cache, h);
385 static int load_block_from_cache(BGZF *fp, int64_t block_address)
389 khash_t(cache) *h = (khash_t(cache)*)fp->cache;
390 k = kh_get(cache, h, block_address);
391 if (k == kh_end(h)) return 0;
393 if (fp->block_length != 0) fp->block_offset = 0;
394 fp->block_address = block_address;
395 fp->block_length = p->size;
396 memcpy(fp->uncompressed_block, p->block, MAX_BLOCK_SIZE);
398 knet_seek(fp->x.fpr, p->end_offset, SEEK_SET);
400 fseeko(fp->file, p->end_offset, SEEK_SET);
405 static void cache_block(BGZF *fp, int size)
410 khash_t(cache) *h = (khash_t(cache)*)fp->cache;
411 if (MAX_BLOCK_SIZE >= fp->cache_size) return;
412 if ((kh_size(h) + 1) * MAX_BLOCK_SIZE > fp->cache_size) {
413 /* A better way would be to remove the oldest block in the
414 * cache, but here we remove a random one for simplicity. This
415 * should not have a big impact on performance. */
416 for (k = kh_begin(h); k < kh_end(h); ++k)
417 if (kh_exist(h, k)) break;
419 free(kh_val(h, k).block);
423 k = kh_put(cache, h, fp->block_address, &ret);
424 if (ret == 0) return; // if this happens, a bug!
426 p->size = fp->block_length;
427 p->end_offset = fp->block_address + size;
428 p->block = malloc(MAX_BLOCK_SIZE);
429 memcpy(kh_val(h, k).block, fp->uncompressed_block, MAX_BLOCK_SIZE);
433 bgzf_read_block(BGZF* fp)
435 bgzf_byte_t header[BLOCK_HEADER_LENGTH];
438 int64_t block_address = knet_tell(fp->x.fpr);
439 if (load_block_from_cache(fp, block_address)) return 0;
440 int count = knet_read(fp->x.fpr, header, sizeof(header));
442 int64_t block_address = ftello(fp->file);
443 if (load_block_from_cache(fp, block_address)) return 0;
444 int count = fread(header, 1, sizeof(header), fp->file);
447 fp->block_length = 0;
451 if (count != sizeof(header)) {
452 report_error(fp, "read failed");
455 if (!check_header(header)) {
456 report_error(fp, "invalid block header");
459 int block_length = unpackInt16((uint8_t*)&header[16]) + 1;
460 bgzf_byte_t* compressed_block = (bgzf_byte_t*) fp->compressed_block;
461 memcpy(compressed_block, header, BLOCK_HEADER_LENGTH);
462 int remaining = block_length - BLOCK_HEADER_LENGTH;
464 count = knet_read(fp->x.fpr, &compressed_block[BLOCK_HEADER_LENGTH], remaining);
466 count = fread(&compressed_block[BLOCK_HEADER_LENGTH], 1, remaining, fp->file);
468 if (count != remaining) {
469 report_error(fp, "read failed");
473 count = inflate_block(fp, block_length);
477 if (fp->block_length != 0) {
478 // Do not reset offset if this read follows a seek.
479 fp->block_offset = 0;
481 fp->block_address = block_address;
482 fp->block_length = count;
483 cache_block(fp, size);
488 bgzf_read(BGZF* fp, void* data, int length)
493 if (fp->open_mode != 'r') {
494 report_error(fp, "file not open for reading");
499 bgzf_byte_t* output = data;
500 while (bytes_read < length) {
501 int available = fp->block_length - fp->block_offset;
502 if (available <= 0) {
503 if (bgzf_read_block(fp) != 0) {
506 available = fp->block_length - fp->block_offset;
507 if (available <= 0) {
511 int copy_length = bgzf_min(length-bytes_read, available);
512 bgzf_byte_t* buffer = fp->uncompressed_block;
513 memcpy(output, buffer + fp->block_offset, copy_length);
514 fp->block_offset += copy_length;
515 output += copy_length;
516 bytes_read += copy_length;
518 if (fp->block_offset == fp->block_length) {
520 fp->block_address = knet_tell(fp->x.fpr);
522 fp->block_address = ftello(fp->file);
524 fp->block_offset = 0;
525 fp->block_length = 0;
532 flush_block(BGZF* fp)
534 while (fp->block_offset > 0) {
535 int block_length = deflate_block(fp, fp->block_offset);
536 if (block_length < 0) {
540 int count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw);
542 int count = fwrite(fp->compressed_block, 1, block_length, fp->file);
544 if (count != block_length) {
545 report_error(fp, "write failed");
548 fp->block_address += block_length;
554 bgzf_write(BGZF* fp, const void* data, int length)
556 if (fp->open_mode != 'w') {
557 report_error(fp, "file not open for writing");
561 if (fp->uncompressed_block == NULL) {
562 fp->uncompressed_block = malloc(fp->uncompressed_block_size);
565 const bgzf_byte_t* input = data;
566 int block_length = fp->uncompressed_block_size;
567 int bytes_written = 0;
568 while (bytes_written < length) {
569 int copy_length = bgzf_min(block_length - fp->block_offset, length - bytes_written);
570 bgzf_byte_t* buffer = fp->uncompressed_block;
571 memcpy(buffer + fp->block_offset, input, copy_length);
572 fp->block_offset += copy_length;
573 input += copy_length;
574 bytes_written += copy_length;
575 if (fp->block_offset == block_length) {
576 if (flush_block(fp) != 0) {
581 return bytes_written;
587 if (fp->open_mode == 'w') {
588 if (flush_block(fp) != 0) {
591 { // add an empty block
592 int count, block_length = deflate_block(fp, 0);
594 count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw);
596 count = fwrite(fp->compressed_block, 1, block_length, fp->file);
600 if (fflush(fp->x.fpw) != 0) {
602 if (fflush(fp->file) != 0) {
604 report_error(fp, "flush failed");
608 if (fp->owned_file) {
611 if (fp->open_mode == 'w') ret = fclose(fp->x.fpw);
612 else ret = knet_close(fp->x.fpr);
613 if (ret != 0) return -1;
615 if (fclose(fp->file) != 0) {
620 free(fp->uncompressed_block);
621 free(fp->compressed_block);
627 void bgzf_set_cache_size(BGZF *fp, int cache_size)
629 if (fp) fp->cache_size = cache_size;
632 int bgzf_check_EOF(BGZF *fp)
634 static uint8_t magic[28] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0";
638 offset = knet_tell(fp->x.fpr);
639 if (knet_seek(fp->x.fpr, -28, SEEK_END) != 0) return -1;
640 knet_read(fp->x.fpr, buf, 28);
641 knet_seek(fp->x.fpr, offset, SEEK_SET);
643 offset = ftello(fp->file);
644 if (fseeko(fp->file, -28, SEEK_END) != 0) return -1;
645 fread(buf, 1, 28, fp->file);
646 fseeko(fp->file, offset, SEEK_SET);
648 return (memcmp(magic, buf, 28) == 0)? 1 : 0;
652 bgzf_seek(BGZF* fp, int64_t pos, int where)
654 if (fp->open_mode != 'r') {
655 report_error(fp, "file not open for read");
658 if (where != SEEK_SET) {
659 report_error(fp, "unimplemented seek option");
662 int block_offset = pos & 0xFFFF;
663 int64_t block_address = (pos >> 16) & 0xFFFFFFFFFFFFLL;
665 if (knet_seek(fp->x.fpr, block_address, SEEK_SET) != 0) {
667 if (fseeko(fp->file, block_address, SEEK_SET) != 0) {
669 report_error(fp, "seek failed");
672 fp->block_length = 0; // indicates current block is not loaded
673 fp->block_address = block_address;
674 fp->block_offset = block_offset;