X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=pysam.git;a=blobdiff_plain;f=samtools%2Fbgzf.c.pysam.c;fp=samtools%2Fbgzf.c.pysam.c;h=067b89d1950d0c677626d77eb608a701bb422b2e;hp=0000000000000000000000000000000000000000;hb=d02fe5283ed7a93a2f76a5d6dc6e37b40c11b9b1;hpb=d828f9c9aa78e3d1687265b52de841f3f3852089 diff --git a/samtools/bgzf.c.pysam.c b/samtools/bgzf.c.pysam.c new file mode 100644 index 0000000..067b89d --- /dev/null +++ b/samtools/bgzf.c.pysam.c @@ -0,0 +1,716 @@ +#include "pysam.h" + +/* The MIT License + + Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +/* + 2009-06-29 by lh3: cache recent uncompressed blocks. + 2009-06-25 by lh3: optionally use my knetfile library to access file on a FTP. + 2009-06-12 by lh3: support a mode string like "wu" where 'u' for uncompressed output */ + +#include +#include +#include +#include +#include +#include +#include +#include "bgzf.h" + +#include "khash.h" +typedef struct { + int size; + uint8_t *block; + int64_t end_offset; +} cache_t; +KHASH_MAP_INIT_INT64(cache, cache_t) + +#if defined(_WIN32) || defined(_MSC_VER) +#define ftello(fp) ftell(fp) +#define fseeko(fp, offset, whence) fseek(fp, offset, whence) +#else +extern off_t ftello(FILE *stream); +extern int fseeko(FILE *stream, off_t offset, int whence); +#endif + +typedef int8_t bgzf_byte_t; + +static const int DEFAULT_BLOCK_SIZE = 64 * 1024; +static const int MAX_BLOCK_SIZE = 64 * 1024; + +static const int BLOCK_HEADER_LENGTH = 18; +static const int BLOCK_FOOTER_LENGTH = 8; + +static const int GZIP_ID1 = 31; +static const int GZIP_ID2 = 139; +static const int CM_DEFLATE = 8; +static const int FLG_FEXTRA = 4; +static const int OS_UNKNOWN = 255; +static const int BGZF_ID1 = 66; // 'B' +static const int BGZF_ID2 = 67; // 'C' +static const int BGZF_LEN = 2; +static const int BGZF_XLEN = 6; // BGZF_LEN+4 + +static const int GZIP_WINDOW_BITS = -15; // no zlib header +static const int Z_DEFAULT_MEM_LEVEL = 8; + + +inline +void +packInt16(uint8_t* buffer, uint16_t value) +{ + buffer[0] = value; + buffer[1] = value >> 8; +} + +inline +int +unpackInt16(const uint8_t* buffer) +{ + return (buffer[0] | (buffer[1] << 8)); +} + +inline +void +packInt32(uint8_t* buffer, uint32_t value) +{ + buffer[0] = value; + buffer[1] = value >> 8; + buffer[2] = value >> 16; + buffer[3] = value >> 24; +} + +static inline +int +bgzf_min(int x, int y) +{ + return (x < y) ? x : y; +} + +static +void +report_error(BGZF* fp, const char* message) { + fp->error = message; +} + +int bgzf_check_bgzf(const char *fn) +{ + BGZF *fp; + uint8_t buf[10],magic[10]="\037\213\010\4\0\0\0\0\0\377"; + int n; + + if ((fp = bgzf_open(fn, "r")) == 0) + { + fprintf(pysamerr, "[bgzf_check_bgzf] failed to open the file: %s\n",fn); + return -1; + } + +#ifdef _USE_KNETFILE + n = knet_read(fp->x.fpr, buf, 10); +#else + n = fread(buf, 1, 10, fp->file); +#endif + bgzf_close(fp); + + if ( n!=10 ) + return -1; + + if ( !memcmp(magic, buf, 10) ) return 1; + return 0; +} + +static BGZF *bgzf_read_init() +{ + BGZF *fp; + fp = calloc(1, sizeof(BGZF)); + fp->uncompressed_block_size = MAX_BLOCK_SIZE; + fp->uncompressed_block = malloc(MAX_BLOCK_SIZE); + fp->compressed_block_size = MAX_BLOCK_SIZE; + fp->compressed_block = malloc(MAX_BLOCK_SIZE); + fp->cache_size = 0; + fp->cache = kh_init(cache); + return fp; +} + +static +BGZF* +open_read(int fd) +{ +#ifdef _USE_KNETFILE + knetFile *file = knet_dopen(fd, "r"); +#else + FILE* file = fdopen(fd, "r"); +#endif + BGZF* fp; + if (file == 0) return 0; + fp = bgzf_read_init(); + fp->file_descriptor = fd; + fp->open_mode = 'r'; +#ifdef _USE_KNETFILE + fp->x.fpr = file; +#else + fp->file = file; +#endif + return fp; +} + +static +BGZF* +open_write(int fd, int compress_level) // compress_level==-1 for the default level +{ + FILE* file = fdopen(fd, "w"); + BGZF* fp; + if (file == 0) return 0; + fp = malloc(sizeof(BGZF)); + fp->file_descriptor = fd; + fp->open_mode = 'w'; + fp->owned_file = 0; + fp->compress_level = compress_level < 0? Z_DEFAULT_COMPRESSION : compress_level; // Z_DEFAULT_COMPRESSION==-1 + if (fp->compress_level > 9) fp->compress_level = Z_DEFAULT_COMPRESSION; +#ifdef _USE_KNETFILE + fp->x.fpw = file; +#else + fp->file = file; +#endif + fp->uncompressed_block_size = DEFAULT_BLOCK_SIZE; + fp->uncompressed_block = NULL; + fp->compressed_block_size = MAX_BLOCK_SIZE; + fp->compressed_block = malloc(MAX_BLOCK_SIZE); + fp->block_address = 0; + fp->block_offset = 0; + fp->block_length = 0; + fp->error = NULL; + return fp; +} + +BGZF* +bgzf_open(const char* __restrict path, const char* __restrict mode) +{ + BGZF* fp = NULL; + if (strchr(mode, 'r') || strchr(mode, 'R')) { /* The reading mode is preferred. */ +#ifdef _USE_KNETFILE + knetFile *file = knet_open(path, mode); + if (file == 0) return 0; + fp = bgzf_read_init(); + fp->file_descriptor = -1; + fp->open_mode = 'r'; + fp->x.fpr = file; +#else + int fd, oflag = O_RDONLY; +#ifdef _WIN32 + oflag |= O_BINARY; +#endif + fd = open(path, oflag); + if (fd == -1) return 0; + fp = open_read(fd); +#endif + } else if (strchr(mode, 'w') || strchr(mode, 'W')) { + int fd, compress_level = -1, oflag = O_WRONLY | O_CREAT | O_TRUNC; +#ifdef _WIN32 + oflag |= O_BINARY; +#endif + fd = open(path, oflag, 0666); + if (fd == -1) return 0; + { // set compress_level + int i; + for (i = 0; mode[i]; ++i) + if (mode[i] >= '0' && mode[i] <= '9') break; + if (mode[i]) compress_level = (int)mode[i] - '0'; + if (strchr(mode, 'u')) compress_level = 0; + } + fp = open_write(fd, compress_level); + } + if (fp != NULL) fp->owned_file = 1; + return fp; +} + +BGZF* +bgzf_fdopen(int fd, const char * __restrict mode) +{ + if (fd == -1) return 0; + if (mode[0] == 'r' || mode[0] == 'R') { + return open_read(fd); + } else if (mode[0] == 'w' || mode[0] == 'W') { + int i, compress_level = -1; + for (i = 0; mode[i]; ++i) + if (mode[i] >= '0' && mode[i] <= '9') break; + if (mode[i]) compress_level = (int)mode[i] - '0'; + if (strchr(mode, 'u')) compress_level = 0; + return open_write(fd, compress_level); + } else { + return NULL; + } +} + +static +int +deflate_block(BGZF* fp, int block_length) +{ + // Deflate the block in fp->uncompressed_block into fp->compressed_block. + // Also adds an extra field that stores the compressed block length. + + bgzf_byte_t* buffer = fp->compressed_block; + int buffer_size = fp->compressed_block_size; + + // Init gzip header + buffer[0] = GZIP_ID1; + buffer[1] = GZIP_ID2; + buffer[2] = CM_DEFLATE; + buffer[3] = FLG_FEXTRA; + buffer[4] = 0; // mtime + buffer[5] = 0; + buffer[6] = 0; + buffer[7] = 0; + buffer[8] = 0; + buffer[9] = OS_UNKNOWN; + buffer[10] = BGZF_XLEN; + buffer[11] = 0; + buffer[12] = BGZF_ID1; + buffer[13] = BGZF_ID2; + buffer[14] = BGZF_LEN; + buffer[15] = 0; + buffer[16] = 0; // placeholder for block length + buffer[17] = 0; + + // loop to retry for blocks that do not compress enough + int input_length = block_length; + int compressed_length = 0; + while (1) { + z_stream zs; + zs.zalloc = NULL; + zs.zfree = NULL; + zs.next_in = fp->uncompressed_block; + zs.avail_in = input_length; + zs.next_out = (void*)&buffer[BLOCK_HEADER_LENGTH]; + zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH; + + int status = deflateInit2(&zs, fp->compress_level, Z_DEFLATED, + GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY); + if (status != Z_OK) { + report_error(fp, "deflate init failed"); + return -1; + } + status = deflate(&zs, Z_FINISH); + if (status != Z_STREAM_END) { + deflateEnd(&zs); + if (status == Z_OK) { + // Not enough space in buffer. + // Can happen in the rare case the input doesn't compress enough. + // Reduce the amount of input until it fits. + input_length -= 1024; + if (input_length <= 0) { + // should never happen + report_error(fp, "input reduction failed"); + return -1; + } + continue; + } + report_error(fp, "deflate failed"); + return -1; + } + status = deflateEnd(&zs); + if (status != Z_OK) { + report_error(fp, "deflate end failed"); + return -1; + } + compressed_length = zs.total_out; + compressed_length += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH; + if (compressed_length > MAX_BLOCK_SIZE) { + // should never happen + report_error(fp, "deflate overflow"); + return -1; + } + break; + } + + packInt16((uint8_t*)&buffer[16], compressed_length-1); + uint32_t crc = crc32(0L, NULL, 0L); + crc = crc32(crc, fp->uncompressed_block, input_length); + packInt32((uint8_t*)&buffer[compressed_length-8], crc); + packInt32((uint8_t*)&buffer[compressed_length-4], input_length); + + int remaining = block_length - input_length; + if (remaining > 0) { + if (remaining > input_length) { + // should never happen (check so we can use memcpy) + report_error(fp, "remainder too large"); + return -1; + } + memcpy(fp->uncompressed_block, + fp->uncompressed_block + input_length, + remaining); + } + fp->block_offset = remaining; + return compressed_length; +} + +static +int +inflate_block(BGZF* fp, int block_length) +{ + // Inflate the block in fp->compressed_block into fp->uncompressed_block + + z_stream zs; + int status; + zs.zalloc = NULL; + zs.zfree = NULL; + zs.next_in = fp->compressed_block + 18; + zs.avail_in = block_length - 16; + zs.next_out = fp->uncompressed_block; + zs.avail_out = fp->uncompressed_block_size; + + status = inflateInit2(&zs, GZIP_WINDOW_BITS); + if (status != Z_OK) { + report_error(fp, "inflate init failed"); + return -1; + } + status = inflate(&zs, Z_FINISH); + if (status != Z_STREAM_END) { + inflateEnd(&zs); + report_error(fp, "inflate failed"); + return -1; + } + status = inflateEnd(&zs); + if (status != Z_OK) { + report_error(fp, "inflate failed"); + return -1; + } + return zs.total_out; +} + +static +int +check_header(const bgzf_byte_t* header) +{ + return (header[0] == GZIP_ID1 && + header[1] == (bgzf_byte_t) GZIP_ID2 && + header[2] == Z_DEFLATED && + (header[3] & FLG_FEXTRA) != 0 && + unpackInt16((uint8_t*)&header[10]) == BGZF_XLEN && + header[12] == BGZF_ID1 && + header[13] == BGZF_ID2 && + unpackInt16((uint8_t*)&header[14]) == BGZF_LEN); +} + +static void free_cache(BGZF *fp) +{ + khint_t k; + khash_t(cache) *h = (khash_t(cache)*)fp->cache; + if (fp->open_mode != 'r') return; + for (k = kh_begin(h); k < kh_end(h); ++k) + if (kh_exist(h, k)) free(kh_val(h, k).block); + kh_destroy(cache, h); +} + +static int load_block_from_cache(BGZF *fp, int64_t block_address) +{ + khint_t k; + cache_t *p; + khash_t(cache) *h = (khash_t(cache)*)fp->cache; + k = kh_get(cache, h, block_address); + if (k == kh_end(h)) return 0; + p = &kh_val(h, k); + if (fp->block_length != 0) fp->block_offset = 0; + fp->block_address = block_address; + fp->block_length = p->size; + memcpy(fp->uncompressed_block, p->block, MAX_BLOCK_SIZE); +#ifdef _USE_KNETFILE + knet_seek(fp->x.fpr, p->end_offset, SEEK_SET); +#else + fseeko(fp->file, p->end_offset, SEEK_SET); +#endif + return p->size; +} + +static void cache_block(BGZF *fp, int size) +{ + int ret; + khint_t k; + cache_t *p; + khash_t(cache) *h = (khash_t(cache)*)fp->cache; + if (MAX_BLOCK_SIZE >= fp->cache_size) return; + if ((kh_size(h) + 1) * MAX_BLOCK_SIZE > fp->cache_size) { + /* A better way would be to remove the oldest block in the + * cache, but here we remove a random one for simplicity. This + * should not have a big impact on performance. */ + for (k = kh_begin(h); k < kh_end(h); ++k) + if (kh_exist(h, k)) break; + if (k < kh_end(h)) { + free(kh_val(h, k).block); + kh_del(cache, h, k); + } + } + k = kh_put(cache, h, fp->block_address, &ret); + if (ret == 0) return; // if this happens, a bug! + p = &kh_val(h, k); + p->size = fp->block_length; + p->end_offset = fp->block_address + size; + p->block = malloc(MAX_BLOCK_SIZE); + memcpy(kh_val(h, k).block, fp->uncompressed_block, MAX_BLOCK_SIZE); +} + +int +bgzf_read_block(BGZF* fp) +{ + bgzf_byte_t header[BLOCK_HEADER_LENGTH]; + int count, size = 0, block_length, remaining; +#ifdef _USE_KNETFILE + int64_t block_address = knet_tell(fp->x.fpr); + if (load_block_from_cache(fp, block_address)) return 0; + count = knet_read(fp->x.fpr, header, sizeof(header)); +#else + int64_t block_address = ftello(fp->file); + if (load_block_from_cache(fp, block_address)) return 0; + count = fread(header, 1, sizeof(header), fp->file); +#endif + if (count == 0) { + fp->block_length = 0; + return 0; + } + size = count; + if (count != sizeof(header)) { + report_error(fp, "read failed"); + return -1; + } + if (!check_header(header)) { + report_error(fp, "invalid block header"); + return -1; + } + block_length = unpackInt16((uint8_t*)&header[16]) + 1; + bgzf_byte_t* compressed_block = (bgzf_byte_t*) fp->compressed_block; + memcpy(compressed_block, header, BLOCK_HEADER_LENGTH); + remaining = block_length - BLOCK_HEADER_LENGTH; +#ifdef _USE_KNETFILE + count = knet_read(fp->x.fpr, &compressed_block[BLOCK_HEADER_LENGTH], remaining); +#else + count = fread(&compressed_block[BLOCK_HEADER_LENGTH], 1, remaining, fp->file); +#endif + if (count != remaining) { + report_error(fp, "read failed"); + return -1; + } + size += count; + count = inflate_block(fp, block_length); + if (count < 0) return -1; + if (fp->block_length != 0) { + // Do not reset offset if this read follows a seek. + fp->block_offset = 0; + } + fp->block_address = block_address; + fp->block_length = count; + cache_block(fp, size); + return 0; +} + +int +bgzf_read(BGZF* fp, void* data, int length) +{ + if (length <= 0) { + return 0; + } + if (fp->open_mode != 'r') { + report_error(fp, "file not open for reading"); + return -1; + } + + int bytes_read = 0; + bgzf_byte_t* output = data; + while (bytes_read < length) { + int copy_length, available = fp->block_length - fp->block_offset; + bgzf_byte_t *buffer; + if (available <= 0) { + if (bgzf_read_block(fp) != 0) { + return -1; + } + available = fp->block_length - fp->block_offset; + if (available <= 0) { + break; + } + } + copy_length = bgzf_min(length-bytes_read, available); + buffer = fp->uncompressed_block; + memcpy(output, buffer + fp->block_offset, copy_length); + fp->block_offset += copy_length; + output += copy_length; + bytes_read += copy_length; + } + if (fp->block_offset == fp->block_length) { +#ifdef _USE_KNETFILE + fp->block_address = knet_tell(fp->x.fpr); +#else + fp->block_address = ftello(fp->file); +#endif + fp->block_offset = 0; + fp->block_length = 0; + } + return bytes_read; +} + +int bgzf_flush(BGZF* fp) +{ + while (fp->block_offset > 0) { + int count, block_length; + block_length = deflate_block(fp, fp->block_offset); + if (block_length < 0) return -1; +#ifdef _USE_KNETFILE + count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw); +#else + count = fwrite(fp->compressed_block, 1, block_length, fp->file); +#endif + if (count != block_length) { + report_error(fp, "write failed"); + return -1; + } + fp->block_address += block_length; + } + return 0; +} + +int bgzf_flush_try(BGZF *fp, int size) +{ + if (fp->block_offset + size > fp->uncompressed_block_size) + return bgzf_flush(fp); + return -1; +} + +int bgzf_write(BGZF* fp, const void* data, int length) +{ + const bgzf_byte_t *input = data; + int block_length, bytes_written; + if (fp->open_mode != 'w') { + report_error(fp, "file not open for writing"); + return -1; + } + + if (fp->uncompressed_block == NULL) + fp->uncompressed_block = malloc(fp->uncompressed_block_size); + + input = data; + block_length = fp->uncompressed_block_size; + bytes_written = 0; + while (bytes_written < length) { + int copy_length = bgzf_min(block_length - fp->block_offset, length - bytes_written); + bgzf_byte_t* buffer = fp->uncompressed_block; + memcpy(buffer + fp->block_offset, input, copy_length); + fp->block_offset += copy_length; + input += copy_length; + bytes_written += copy_length; + if (fp->block_offset == block_length) { + if (bgzf_flush(fp) != 0) { + break; + } + } + } + return bytes_written; +} + +int bgzf_close(BGZF* fp) +{ + if (fp->open_mode == 'w') { + if (bgzf_flush(fp) != 0) return -1; + { // add an empty block + int count, block_length = deflate_block(fp, 0); +#ifdef _USE_KNETFILE + count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw); +#else + count = fwrite(fp->compressed_block, 1, block_length, fp->file); +#endif + } +#ifdef _USE_KNETFILE + if (fflush(fp->x.fpw) != 0) { +#else + if (fflush(fp->file) != 0) { +#endif + report_error(fp, "flush failed"); + return -1; + } + } + if (fp->owned_file) { +#ifdef _USE_KNETFILE + int ret; + if (fp->open_mode == 'w') ret = fclose(fp->x.fpw); + else ret = knet_close(fp->x.fpr); + if (ret != 0) return -1; +#else + if (fclose(fp->file) != 0) return -1; +#endif + } + free(fp->uncompressed_block); + free(fp->compressed_block); + free_cache(fp); + free(fp); + return 0; +} + +void bgzf_set_cache_size(BGZF *fp, int cache_size) +{ + if (fp) fp->cache_size = cache_size; +} + +int bgzf_check_EOF(BGZF *fp) +{ + static uint8_t magic[28] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0"; + uint8_t buf[28]; + off_t offset; +#ifdef _USE_KNETFILE + offset = knet_tell(fp->x.fpr); + if (knet_seek(fp->x.fpr, -28, SEEK_END) != 0) return -1; + knet_read(fp->x.fpr, buf, 28); + knet_seek(fp->x.fpr, offset, SEEK_SET); +#else + offset = ftello(fp->file); + if (fseeko(fp->file, -28, SEEK_END) != 0) return -1; + fread(buf, 1, 28, fp->file); + fseeko(fp->file, offset, SEEK_SET); +#endif + return (memcmp(magic, buf, 28) == 0)? 1 : 0; +} + +int64_t bgzf_seek(BGZF* fp, int64_t pos, int where) +{ + int block_offset; + int64_t block_address; + + if (fp->open_mode != 'r') { + report_error(fp, "file not open for read"); + return -1; + } + if (where != SEEK_SET) { + report_error(fp, "unimplemented seek option"); + return -1; + } + block_offset = pos & 0xFFFF; + block_address = (pos >> 16) & 0xFFFFFFFFFFFFLL; +#ifdef _USE_KNETFILE + if (knet_seek(fp->x.fpr, block_address, SEEK_SET) != 0) { +#else + if (fseeko(fp->file, block_address, SEEK_SET) != 0) { +#endif + report_error(fp, "seek failed"); + return -1; + } + fp->block_length = 0; // indicates current block is not loaded + fp->block_address = block_address; + fp->block_offset = block_offset; + return 0; +}