/* The MIT License
Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
+ 2011 Attractive Chaos <attractor@live.co.uk>
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
THE SOFTWARE.
*/
-/*
- 2009-06-29 by lh3: cache recent uncompressed blocks.
- 2009-06-25 by lh3: optionally use my knetfile library to access file on a FTP.
- 2009-06-12 by lh3: support a mode string like "wu" where 'u' for uncompressed output */
-
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
-#include <fcntl.h>
+#include <assert.h>
#include <sys/types.h>
-#include <sys/stat.h>
#include "bgzf.h"
-#include "khash.h"
+#ifdef _USE_KNETFILE
+#include "knetfile.h"
+typedef knetFile *_bgzf_file_t;
+#define _bgzf_open(fn, mode) knet_open(fn, mode)
+#define _bgzf_dopen(fp, mode) knet_dopen(fp, mode)
+#define _bgzf_close(fp) knet_close(fp)
+#define _bgzf_fileno(fp) ((fp)->fd)
+#define _bgzf_tell(fp) knet_tell(fp)
+#define _bgzf_seek(fp, offset, whence) knet_seek(fp, offset, whence)
+#define _bgzf_read(fp, buf, len) knet_read(fp, buf, len)
+#define _bgzf_write(fp, buf, len) knet_write(fp, buf, len)
+#else // ~defined(_USE_KNETFILE)
+#if defined(_WIN32) || defined(_MSC_VER)
+#define ftello(fp) ftell(fp)
+#define fseeko(fp, offset, whence) fseek(fp, offset, whence)
+#else // ~defined(_WIN32)
+extern off_t ftello(FILE *stream);
+extern int fseeko(FILE *stream, off_t offset, int whence);
+#endif // ~defined(_WIN32)
+typedef FILE *_bgzf_file_t;
+#define _bgzf_open(fn, mode) fopen(fn, mode)
+#define _bgzf_dopen(fp, mode) fdopen(fp, mode)
+#define _bgzf_close(fp) fclose(fp)
+#define _bgzf_fileno(fp) fileno(fp)
+#define _bgzf_tell(fp) ftello(fp)
+#define _bgzf_seek(fp, offset, whence) fseeko(fp, offset, whence)
+#define _bgzf_read(fp, buf, len) fread(buf, 1, len, fp)
+#define _bgzf_write(fp, buf, len) fwrite(buf, 1, len, fp)
+#endif // ~define(_USE_KNETFILE)
+
+#define BLOCK_HEADER_LENGTH 18
+#define BLOCK_FOOTER_LENGTH 8
+
+/* BGZF/GZIP header (speciallized from RFC 1952; little endian):
+ +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
+ | 31|139| 8| 4| 0| 0|255| 6| 66| 67| 2|BLK_LEN|
+ +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
+*/
+static const uint8_t g_magic[19] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\0\0";
+
+#ifdef BGZF_CACHE
typedef struct {
int size;
uint8_t *block;
int64_t end_offset;
} cache_t;
+#include "khash.h"
KHASH_MAP_INIT_INT64(cache, cache_t)
-
-#if defined(_WIN32) || defined(_MSC_VER)
-#define ftello(fp) ftell(fp)
-#define fseeko(fp, offset, whence) fseek(fp, offset, whence)
-#else
-extern off_t ftello(FILE *stream);
-extern int fseeko(FILE *stream, off_t offset, int whence);
#endif
-typedef int8_t bgzf_byte_t;
-
-static const int DEFAULT_BLOCK_SIZE = 64 * 1024;
-static const int MAX_BLOCK_SIZE = 64 * 1024;
-
-static const int BLOCK_HEADER_LENGTH = 18;
-static const int BLOCK_FOOTER_LENGTH = 8;
-
-static const int GZIP_ID1 = 31;
-static const int GZIP_ID2 = 139;
-static const int CM_DEFLATE = 8;
-static const int FLG_FEXTRA = 4;
-static const int OS_UNKNOWN = 255;
-static const int BGZF_ID1 = 66; // 'B'
-static const int BGZF_ID2 = 67; // 'C'
-static const int BGZF_LEN = 2;
-static const int BGZF_XLEN = 6; // BGZF_LEN+4
-
-static const int GZIP_WINDOW_BITS = -15; // no zlib header
-static const int Z_DEFAULT_MEM_LEVEL = 8;
-
-
-inline
-void
-packInt16(uint8_t* buffer, uint16_t value)
-{
- buffer[0] = value;
- buffer[1] = value >> 8;
-}
-
-inline
-int
-unpackInt16(const uint8_t* buffer)
+static inline void packInt16(uint8_t *buffer, uint16_t value)
{
- return (buffer[0] | (buffer[1] << 8));
+ buffer[0] = value;
+ buffer[1] = value >> 8;
}
-inline
-void
-packInt32(uint8_t* buffer, uint32_t value)
+static inline int unpackInt16(const uint8_t *buffer)
{
- buffer[0] = value;
- buffer[1] = value >> 8;
- buffer[2] = value >> 16;
- buffer[3] = value >> 24;
+ return buffer[0] | buffer[1] << 8;
}
-static inline
-int
-bgzf_min(int x, int y)
+static inline void packInt32(uint8_t *buffer, uint32_t value)
{
- return (x < y) ? x : y;
-}
-
-static
-void
-report_error(BGZF* fp, const char* message) {
- fp->error = message;
-}
-
-int bgzf_check_bgzf(const char *fn)
-{
- BGZF *fp;
- uint8_t buf[10],magic[10]="\037\213\010\4\0\0\0\0\0\377";
- int n;
-
- if ((fp = bgzf_open(fn, "r")) == 0)
- {
- fprintf(stderr, "[bgzf_check_bgzf] failed to open the file: %s\n",fn);
- return -1;
- }
-
-#ifdef _USE_KNETFILE
- n = knet_read(fp->x.fpr, buf, 10);
-#else
- n = fread(buf, 1, 10, fp->file);
-#endif
- bgzf_close(fp);
-
- if ( n!=10 )
- return -1;
-
- if ( !memcmp(magic, buf, 10) ) return 1;
- return 0;
+ buffer[0] = value;
+ buffer[1] = value >> 8;
+ buffer[2] = value >> 16;
+ buffer[3] = value >> 24;
}
static BGZF *bgzf_read_init()
{
BGZF *fp;
fp = calloc(1, sizeof(BGZF));
- fp->uncompressed_block_size = MAX_BLOCK_SIZE;
- fp->uncompressed_block = malloc(MAX_BLOCK_SIZE);
- fp->compressed_block_size = MAX_BLOCK_SIZE;
- fp->compressed_block = malloc(MAX_BLOCK_SIZE);
- fp->cache_size = 0;
+ fp->open_mode = 'r';
+ fp->uncompressed_block = malloc(BGZF_BLOCK_SIZE);
+ fp->compressed_block = malloc(BGZF_BLOCK_SIZE);
+#ifdef BGZF_CACHE
fp->cache = kh_init(cache);
- return fp;
-}
-
-static
-BGZF*
-open_read(int fd)
-{
-#ifdef _USE_KNETFILE
- knetFile *file = knet_dopen(fd, "r");
-#else
- FILE* file = fdopen(fd, "r");
-#endif
- BGZF* fp;
- if (file == 0) return 0;
- fp = bgzf_read_init();
- fp->file_descriptor = fd;
- fp->open_mode = 'r';
-#ifdef _USE_KNETFILE
- fp->x.fpr = file;
-#else
- fp->file = file;
#endif
- return fp;
+ return fp;
}
-static
-BGZF*
-open_write(int fd, int compress_level) // compress_level==-1 for the default level
+static BGZF *bgzf_write_init(int compress_level) // compress_level==-1 for the default level
{
- FILE* file = fdopen(fd, "w");
- BGZF* fp;
- if (file == 0) return 0;
- fp = malloc(sizeof(BGZF));
- fp->file_descriptor = fd;
- fp->open_mode = 'w';
- fp->owned_file = 0;
+ BGZF *fp;
+ fp = calloc(1, sizeof(BGZF));
+ fp->open_mode = 'w';
+ fp->uncompressed_block = malloc(BGZF_BLOCK_SIZE);
+ fp->compressed_block = malloc(BGZF_BLOCK_SIZE);
fp->compress_level = compress_level < 0? Z_DEFAULT_COMPRESSION : compress_level; // Z_DEFAULT_COMPRESSION==-1
if (fp->compress_level > 9) fp->compress_level = Z_DEFAULT_COMPRESSION;
-#ifdef _USE_KNETFILE
- fp->x.fpw = file;
-#else
- fp->file = file;
-#endif
- fp->uncompressed_block_size = DEFAULT_BLOCK_SIZE;
- fp->uncompressed_block = NULL;
- fp->compressed_block_size = MAX_BLOCK_SIZE;
- fp->compressed_block = malloc(MAX_BLOCK_SIZE);
- fp->block_address = 0;
- fp->block_offset = 0;
- fp->block_length = 0;
- fp->error = NULL;
- return fp;
+ return fp;
+}
+// get the compress level from the mode string
+static int mode2level(const char *__restrict mode)
+{
+ int i, compress_level = -1;
+ for (i = 0; mode[i]; ++i)
+ if (mode[i] >= '0' && mode[i] <= '9') break;
+ if (mode[i]) compress_level = (int)mode[i] - '0';
+ if (strchr(mode, 'u')) compress_level = 0;
+ return compress_level;
}
-BGZF*
-bgzf_open(const char* __restrict path, const char* __restrict mode)
+BGZF *bgzf_open(const char *path, const char *mode)
{
- BGZF* fp = NULL;
- if (strchr(mode, 'r') || strchr(mode, 'R')) { /* The reading mode is preferred. */
-#ifdef _USE_KNETFILE
- knetFile *file = knet_open(path, mode);
- if (file == 0) return 0;
+ BGZF *fp = 0;
+ if (strchr(mode, 'r') || strchr(mode, 'R')) {
+ _bgzf_file_t fpr;
+ if ((fpr = _bgzf_open(path, "r")) == 0) return 0;
fp = bgzf_read_init();
- fp->file_descriptor = -1;
- fp->open_mode = 'r';
- fp->x.fpr = file;
-#else
- int fd, oflag = O_RDONLY;
-#ifdef _WIN32
- oflag |= O_BINARY;
-#endif
- fd = open(path, oflag);
- if (fd == -1) return 0;
- fp = open_read(fd);
-#endif
- } else if (strchr(mode, 'w') || strchr(mode, 'W')) {
- int fd, compress_level = -1, oflag = O_WRONLY | O_CREAT | O_TRUNC;
-#ifdef _WIN32
- oflag |= O_BINARY;
-#endif
- fd = open(path, oflag, 0666);
- if (fd == -1) return 0;
- { // set compress_level
- int i;
- for (i = 0; mode[i]; ++i)
- if (mode[i] >= '0' && mode[i] <= '9') break;
- if (mode[i]) compress_level = (int)mode[i] - '0';
- if (strchr(mode, 'u')) compress_level = 0;
- }
- fp = open_write(fd, compress_level);
- }
- if (fp != NULL) fp->owned_file = 1;
- return fp;
+ fp->fp = fpr;
+ } else if (strchr(mode, 'w') || strchr(mode, 'W')) {
+ FILE *fpw;
+ if ((fpw = fopen(path, "w")) == 0) return 0;
+ fp = bgzf_write_init(mode2level(mode));
+ fp->fp = fpw;
+ }
+ return fp;
}
-BGZF*
-bgzf_fdopen(int fd, const char * __restrict mode)
+BGZF *bgzf_dopen(int fd, const char *mode)
{
- if (fd == -1) return 0;
- if (mode[0] == 'r' || mode[0] == 'R') {
- return open_read(fd);
- } else if (mode[0] == 'w' || mode[0] == 'W') {
- int i, compress_level = -1;
- for (i = 0; mode[i]; ++i)
- if (mode[i] >= '0' && mode[i] <= '9') break;
- if (mode[i]) compress_level = (int)mode[i] - '0';
- if (strchr(mode, 'u')) compress_level = 0;
- return open_write(fd, compress_level);
- } else {
- return NULL;
- }
+ BGZF *fp = 0;
+ if (strchr(mode, 'r') || strchr(mode, 'R')) {
+ _bgzf_file_t fpr;
+ if ((fpr = _bgzf_dopen(fd, "r")) == 0) return 0;
+ fp = bgzf_read_init();
+ fp->fp = fpr;
+ } else if (strchr(mode, 'w') || strchr(mode, 'W')) {
+ FILE *fpw;
+ if ((fpw = fdopen(fd, "w")) == 0) return 0;
+ fp = bgzf_write_init(mode2level(mode));
+ fp->fp = fpw;
+ }
+ return fp;
}
-static
-int
-deflate_block(BGZF* fp, int block_length)
+// Deflate the block in fp->uncompressed_block into fp->compressed_block. Also adds an extra field that stores the compressed block length.
+static int deflate_block(BGZF *fp, int block_length)
{
- // Deflate the block in fp->uncompressed_block into fp->compressed_block.
- // Also adds an extra field that stores the compressed block length.
-
- bgzf_byte_t* buffer = fp->compressed_block;
- int buffer_size = fp->compressed_block_size;
-
- // Init gzip header
- buffer[0] = GZIP_ID1;
- buffer[1] = GZIP_ID2;
- buffer[2] = CM_DEFLATE;
- buffer[3] = FLG_FEXTRA;
- buffer[4] = 0; // mtime
- buffer[5] = 0;
- buffer[6] = 0;
- buffer[7] = 0;
- buffer[8] = 0;
- buffer[9] = OS_UNKNOWN;
- buffer[10] = BGZF_XLEN;
- buffer[11] = 0;
- buffer[12] = BGZF_ID1;
- buffer[13] = BGZF_ID2;
- buffer[14] = BGZF_LEN;
- buffer[15] = 0;
- buffer[16] = 0; // placeholder for block length
- buffer[17] = 0;
-
- // loop to retry for blocks that do not compress enough
- int input_length = block_length;
- int compressed_length = 0;
- while (1) {
- z_stream zs;
- zs.zalloc = NULL;
- zs.zfree = NULL;
- zs.next_in = fp->uncompressed_block;
- zs.avail_in = input_length;
- zs.next_out = (void*)&buffer[BLOCK_HEADER_LENGTH];
- zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
-
- int status = deflateInit2(&zs, fp->compress_level, Z_DEFLATED,
- GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
- if (status != Z_OK) {
- report_error(fp, "deflate init failed");
- return -1;
- }
- status = deflate(&zs, Z_FINISH);
- if (status != Z_STREAM_END) {
- deflateEnd(&zs);
- if (status == Z_OK) {
- // Not enough space in buffer.
- // Can happen in the rare case the input doesn't compress enough.
- // Reduce the amount of input until it fits.
- input_length -= 1024;
- if (input_length <= 0) {
- // should never happen
- report_error(fp, "input reduction failed");
- return -1;
- }
- continue;
- }
- report_error(fp, "deflate failed");
- return -1;
- }
- status = deflateEnd(&zs);
- if (status != Z_OK) {
- report_error(fp, "deflate end failed");
- return -1;
- }
- compressed_length = zs.total_out;
- compressed_length += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
- if (compressed_length > MAX_BLOCK_SIZE) {
- // should never happen
- report_error(fp, "deflate overflow");
- return -1;
- }
- break;
- }
+ uint8_t *buffer = fp->compressed_block;
+ int buffer_size = BGZF_BLOCK_SIZE;
+ int input_length = block_length;
+ int compressed_length = 0;
+ int remaining;
+ uint32_t crc;
+
+ assert(block_length <= BGZF_BLOCK_SIZE); // guaranteed by the caller
+ memcpy(buffer, g_magic, BLOCK_HEADER_LENGTH); // the last two bytes are a place holder for the length of the block
+ while (1) { // loop to retry for blocks that do not compress enough
+ int status;
+ z_stream zs;
+ zs.zalloc = NULL;
+ zs.zfree = NULL;
+ zs.next_in = fp->uncompressed_block;
+ zs.avail_in = input_length;
+ zs.next_out = (void*)&buffer[BLOCK_HEADER_LENGTH];
+ zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
+ status = deflateInit2(&zs, fp->compress_level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY); // -15 to disable zlib header/footer
+ if (status != Z_OK) {
+ fp->errcode |= BGZF_ERR_ZLIB;
+ return -1;
+ }
+ status = deflate(&zs, Z_FINISH);
+ if (status != Z_STREAM_END) { // not compressed enough
+ deflateEnd(&zs); // reset the stream
+ if (status == Z_OK) { // reduce the size and recompress
+ input_length -= 1024;
+ assert(input_length > 0); // logically, this should not happen
+ continue;
+ }
+ fp->errcode |= BGZF_ERR_ZLIB;
+ return -1;
+ }
+ if (deflateEnd(&zs) != Z_OK) {
+ fp->errcode |= BGZF_ERR_ZLIB;
+ return -1;
+ }
+ compressed_length = zs.total_out;
+ compressed_length += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
+ assert(compressed_length <= BGZF_BLOCK_SIZE);
+ break;
+ }
- packInt16((uint8_t*)&buffer[16], compressed_length-1);
- uint32_t crc = crc32(0L, NULL, 0L);
- crc = crc32(crc, fp->uncompressed_block, input_length);
- packInt32((uint8_t*)&buffer[compressed_length-8], crc);
- packInt32((uint8_t*)&buffer[compressed_length-4], input_length);
-
- int remaining = block_length - input_length;
- if (remaining > 0) {
- if (remaining > input_length) {
- // should never happen (check so we can use memcpy)
- report_error(fp, "remainder too large");
- return -1;
- }
- memcpy(fp->uncompressed_block,
- fp->uncompressed_block + input_length,
- remaining);
- }
- fp->block_offset = remaining;
- return compressed_length;
+ assert(compressed_length > 0);
+ packInt16((uint8_t*)&buffer[16], compressed_length - 1); // write the compressed_length; -1 to fit 2 bytes
+ crc = crc32(0L, NULL, 0L);
+ crc = crc32(crc, fp->uncompressed_block, input_length);
+ packInt32((uint8_t*)&buffer[compressed_length-8], crc);
+ packInt32((uint8_t*)&buffer[compressed_length-4], input_length);
+
+ remaining = block_length - input_length;
+ if (remaining > 0) {
+ assert(remaining <= input_length);
+ memcpy(fp->uncompressed_block, fp->uncompressed_block + input_length, remaining);
+ }
+ fp->block_offset = remaining;
+ return compressed_length;
}
-static
-int
-inflate_block(BGZF* fp, int block_length)
+// Inflate the block in fp->compressed_block into fp->uncompressed_block
+static int inflate_block(BGZF* fp, int block_length)
{
- // Inflate the block in fp->compressed_block into fp->uncompressed_block
-
- z_stream zs;
- int status;
- zs.zalloc = NULL;
- zs.zfree = NULL;
- zs.next_in = fp->compressed_block + 18;
- zs.avail_in = block_length - 16;
- zs.next_out = fp->uncompressed_block;
- zs.avail_out = fp->uncompressed_block_size;
-
- status = inflateInit2(&zs, GZIP_WINDOW_BITS);
- if (status != Z_OK) {
- report_error(fp, "inflate init failed");
- return -1;
- }
- status = inflate(&zs, Z_FINISH);
- if (status != Z_STREAM_END) {
- inflateEnd(&zs);
- report_error(fp, "inflate failed");
- return -1;
- }
- status = inflateEnd(&zs);
- if (status != Z_OK) {
- report_error(fp, "inflate failed");
- return -1;
- }
- return zs.total_out;
+ z_stream zs;
+ zs.zalloc = NULL;
+ zs.zfree = NULL;
+ zs.next_in = fp->compressed_block + 18;
+ zs.avail_in = block_length - 16;
+ zs.next_out = fp->uncompressed_block;
+ zs.avail_out = BGZF_BLOCK_SIZE;
+
+ if (inflateInit2(&zs, -15) != Z_OK) {
+ fp->errcode |= BGZF_ERR_ZLIB;
+ return -1;
+ }
+ if (inflate(&zs, Z_FINISH) != Z_STREAM_END) {
+ inflateEnd(&zs);
+ fp->errcode |= BGZF_ERR_ZLIB;
+ return -1;
+ }
+ if (inflateEnd(&zs) != Z_OK) {
+ fp->errcode |= BGZF_ERR_ZLIB;
+ return -1;
+ }
+ return zs.total_out;
}
-static
-int
-check_header(const bgzf_byte_t* header)
+static int check_header(const uint8_t *header)
{
- return (header[0] == GZIP_ID1 &&
- header[1] == (bgzf_byte_t) GZIP_ID2 &&
- header[2] == Z_DEFLATED &&
- (header[3] & FLG_FEXTRA) != 0 &&
- unpackInt16((uint8_t*)&header[10]) == BGZF_XLEN &&
- header[12] == BGZF_ID1 &&
- header[13] == BGZF_ID2 &&
- unpackInt16((uint8_t*)&header[14]) == BGZF_LEN);
+ return (header[0] == 31 && header[1] == 139 && header[2] == 8 && (header[3] & 4) != 0
+ && unpackInt16((uint8_t*)&header[10]) == 6
+ && header[12] == 'B' && header[13] == 'C'
+ && unpackInt16((uint8_t*)&header[14]) == 2);
}
+#ifdef BGZF_CACHE
static void free_cache(BGZF *fp)
{
khint_t k;
if (fp->block_length != 0) fp->block_offset = 0;
fp->block_address = block_address;
fp->block_length = p->size;
- memcpy(fp->uncompressed_block, p->block, MAX_BLOCK_SIZE);
-#ifdef _USE_KNETFILE
- knet_seek(fp->x.fpr, p->end_offset, SEEK_SET);
-#else
- fseeko(fp->file, p->end_offset, SEEK_SET);
-#endif
+ memcpy(fp->uncompressed_block, p->block, BGZF_BLOCK_SIZE);
+ _bgzf_seek((_bgzf_file_t)fp->fp, p->end_offset, SEEK_SET);
return p->size;
}
khint_t k;
cache_t *p;
khash_t(cache) *h = (khash_t(cache)*)fp->cache;
- if (MAX_BLOCK_SIZE >= fp->cache_size) return;
- if ((kh_size(h) + 1) * MAX_BLOCK_SIZE > fp->cache_size) {
+ if (BGZF_BLOCK_SIZE >= fp->cache_size) return;
+ if ((kh_size(h) + 1) * BGZF_BLOCK_SIZE > fp->cache_size) {
/* A better way would be to remove the oldest block in the
* cache, but here we remove a random one for simplicity. This
* should not have a big impact on performance. */
p = &kh_val(h, k);
p->size = fp->block_length;
p->end_offset = fp->block_address + size;
- p->block = malloc(MAX_BLOCK_SIZE);
- memcpy(kh_val(h, k).block, fp->uncompressed_block, MAX_BLOCK_SIZE);
+ p->block = malloc(BGZF_BLOCK_SIZE);
+ memcpy(kh_val(h, k).block, fp->uncompressed_block, BGZF_BLOCK_SIZE);
}
+#else
+static void free_cache(BGZF *fp) {}
+static int load_block_from_cache(BGZF *fp, int64_t block_address) {return 0;}
+static void cache_block(BGZF *fp, int size) {}
+#endif
-int
-bgzf_read_block(BGZF* fp)
+int bgzf_read_block(BGZF *fp)
{
- bgzf_byte_t header[BLOCK_HEADER_LENGTH];
+ uint8_t header[BLOCK_HEADER_LENGTH], *compressed_block;
int count, size = 0, block_length, remaining;
-#ifdef _USE_KNETFILE
- int64_t block_address = knet_tell(fp->x.fpr);
- if (load_block_from_cache(fp, block_address)) return 0;
- count = knet_read(fp->x.fpr, header, sizeof(header));
-#else
- int64_t block_address = ftello(fp->file);
+ int64_t block_address;
+ block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
if (load_block_from_cache(fp, block_address)) return 0;
- count = fread(header, 1, sizeof(header), fp->file);
-#endif
- if (count == 0) {
- fp->block_length = 0;
- return 0;
- }
+ count = _bgzf_read(fp->fp, header, sizeof(header));
+ if (count == 0) { // no data read
+ fp->block_length = 0;
+ return 0;
+ }
+ if (count != sizeof(header) || !check_header(header)) {
+ fp->errcode |= BGZF_ERR_HEADER;
+ return -1;
+ }
size = count;
- if (count != sizeof(header)) {
- report_error(fp, "read failed");
- return -1;
- }
- if (!check_header(header)) {
- report_error(fp, "invalid block header");
- return -1;
- }
- block_length = unpackInt16((uint8_t*)&header[16]) + 1;
- bgzf_byte_t* compressed_block = (bgzf_byte_t*) fp->compressed_block;
- memcpy(compressed_block, header, BLOCK_HEADER_LENGTH);
- remaining = block_length - BLOCK_HEADER_LENGTH;
-#ifdef _USE_KNETFILE
- count = knet_read(fp->x.fpr, &compressed_block[BLOCK_HEADER_LENGTH], remaining);
-#else
- count = fread(&compressed_block[BLOCK_HEADER_LENGTH], 1, remaining, fp->file);
-#endif
- if (count != remaining) {
- report_error(fp, "read failed");
- return -1;
- }
+ block_length = unpackInt16((uint8_t*)&header[16]) + 1; // +1 because when writing this number, we used "-1"
+ compressed_block = (uint8_t*)fp->compressed_block;
+ memcpy(compressed_block, header, BLOCK_HEADER_LENGTH);
+ remaining = block_length - BLOCK_HEADER_LENGTH;
+ count = _bgzf_read(fp->fp, &compressed_block[BLOCK_HEADER_LENGTH], remaining);
+ if (count != remaining) {
+ fp->errcode |= BGZF_ERR_IO;
+ return -1;
+ }
size += count;
- count = inflate_block(fp, block_length);
- if (count < 0) return -1;
- if (fp->block_length != 0) {
- // Do not reset offset if this read follows a seek.
- fp->block_offset = 0;
- }
- fp->block_address = block_address;
- fp->block_length = count;
+ if ((count = inflate_block(fp, block_length)) < 0) return -1;
+ if (fp->block_length != 0) fp->block_offset = 0; // Do not reset offset if this read follows a seek.
+ fp->block_address = block_address;
+ fp->block_length = count;
cache_block(fp, size);
- return 0;
+ return 0;
}
-int
-bgzf_read(BGZF* fp, void* data, int length)
+ssize_t bgzf_read(BGZF *fp, void *data, ssize_t length)
{
- if (length <= 0) {
- return 0;
- }
- if (fp->open_mode != 'r') {
- report_error(fp, "file not open for reading");
- return -1;
- }
-
- int bytes_read = 0;
- bgzf_byte_t* output = data;
- while (bytes_read < length) {
- int copy_length, available = fp->block_length - fp->block_offset;
- bgzf_byte_t *buffer;
- if (available <= 0) {
- if (bgzf_read_block(fp) != 0) {
- return -1;
- }
- available = fp->block_length - fp->block_offset;
- if (available <= 0) {
- break;
- }
- }
- copy_length = bgzf_min(length-bytes_read, available);
- buffer = fp->uncompressed_block;
- memcpy(output, buffer + fp->block_offset, copy_length);
- fp->block_offset += copy_length;
- output += copy_length;
- bytes_read += copy_length;
- }
- if (fp->block_offset == fp->block_length) {
-#ifdef _USE_KNETFILE
- fp->block_address = knet_tell(fp->x.fpr);
-#else
- fp->block_address = ftello(fp->file);
-#endif
- fp->block_offset = 0;
- fp->block_length = 0;
- }
- return bytes_read;
+ ssize_t bytes_read = 0;
+ uint8_t *output = data;
+ if (length <= 0) return 0;
+ assert(fp->open_mode == 'r');
+ while (bytes_read < length) {
+ int copy_length, available = fp->block_length - fp->block_offset;
+ uint8_t *buffer;
+ if (available <= 0) {
+ if (bgzf_read_block(fp) != 0) return -1;
+ available = fp->block_length - fp->block_offset;
+ if (available <= 0) break;
+ }
+ copy_length = length - bytes_read < available? length - bytes_read : available;
+ buffer = fp->uncompressed_block;
+ memcpy(output, buffer + fp->block_offset, copy_length);
+ fp->block_offset += copy_length;
+ output += copy_length;
+ bytes_read += copy_length;
+ }
+ if (fp->block_offset == fp->block_length) {
+ fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
+ fp->block_offset = fp->block_length = 0;
+ }
+ return bytes_read;
}
-int bgzf_flush(BGZF* fp)
+int bgzf_flush(BGZF *fp)
{
- while (fp->block_offset > 0) {
- int count, block_length;
+ assert(fp->open_mode == 'w');
+ while (fp->block_offset > 0) {
+ int block_length;
block_length = deflate_block(fp, fp->block_offset);
- if (block_length < 0) return -1;
-#ifdef _USE_KNETFILE
- count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw);
-#else
- count = fwrite(fp->compressed_block, 1, block_length, fp->file);
-#endif
- if (count != block_length) {
- report_error(fp, "write failed");
- return -1;
- }
- fp->block_address += block_length;
- }
- return 0;
+ if (block_length < 0) return -1;
+ if (fwrite(fp->compressed_block, 1, block_length, fp->fp) != block_length) {
+ fp->errcode |= BGZF_ERR_IO; // possibly truncated file
+ return -1;
+ }
+ fp->block_address += block_length;
+ }
+ return 0;
}
-int bgzf_flush_try(BGZF *fp, int size)
+int bgzf_flush_try(BGZF *fp, ssize_t size)
{
- if (fp->block_offset + size > fp->uncompressed_block_size)
+ if (fp->block_offset + size > BGZF_BLOCK_SIZE)
return bgzf_flush(fp);
return -1;
}
-int bgzf_write(BGZF* fp, const void* data, int length)
+ssize_t bgzf_write(BGZF *fp, const void *data, ssize_t length)
{
- const bgzf_byte_t *input = data;
- int block_length, bytes_written;
- if (fp->open_mode != 'w') {
- report_error(fp, "file not open for writing");
- return -1;
- }
-
- if (fp->uncompressed_block == NULL)
- fp->uncompressed_block = malloc(fp->uncompressed_block_size);
-
- input = data;
- block_length = fp->uncompressed_block_size;
- bytes_written = 0;
- while (bytes_written < length) {
- int copy_length = bgzf_min(block_length - fp->block_offset, length - bytes_written);
- bgzf_byte_t* buffer = fp->uncompressed_block;
- memcpy(buffer + fp->block_offset, input, copy_length);
- fp->block_offset += copy_length;
- input += copy_length;
- bytes_written += copy_length;
- if (fp->block_offset == block_length) {
- if (bgzf_flush(fp) != 0) {
- break;
- }
- }
- }
- return bytes_written;
+ const uint8_t *input = data;
+ int block_length = BGZF_BLOCK_SIZE, bytes_written;
+ assert(fp->open_mode == 'w');
+ input = data;
+ bytes_written = 0;
+ while (bytes_written < length) {
+ uint8_t* buffer = fp->uncompressed_block;
+ int copy_length = block_length - fp->block_offset < length - bytes_written? block_length - fp->block_offset : length - bytes_written;
+ memcpy(buffer + fp->block_offset, input, copy_length);
+ fp->block_offset += copy_length;
+ input += copy_length;
+ bytes_written += copy_length;
+ if (fp->block_offset == block_length && bgzf_flush(fp)) break;
+ }
+ return bytes_written;
}
int bgzf_close(BGZF* fp)
{
- if (fp->open_mode == 'w') {
- if (bgzf_flush(fp) != 0) return -1;
- { // add an empty block
- int count, block_length = deflate_block(fp, 0);
-#ifdef _USE_KNETFILE
- count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw);
-#else
- count = fwrite(fp->compressed_block, 1, block_length, fp->file);
-#endif
+ int ret, count, block_length;
+ if (fp == 0) return -1;
+ if (fp->open_mode == 'w') {
+ if (bgzf_flush(fp) != 0) return -1;
+ block_length = deflate_block(fp, 0); // write an empty block
+ count = fwrite(fp->compressed_block, 1, block_length, fp->fp);
+ if (fflush(fp->fp) != 0) {
+ fp->errcode |= BGZF_ERR_IO;
+ return -1;
}
-#ifdef _USE_KNETFILE
- if (fflush(fp->x.fpw) != 0) {
-#else
- if (fflush(fp->file) != 0) {
-#endif
- report_error(fp, "flush failed");
- return -1;
- }
- }
- if (fp->owned_file) {
-#ifdef _USE_KNETFILE
- int ret;
- if (fp->open_mode == 'w') ret = fclose(fp->x.fpw);
- else ret = knet_close(fp->x.fpr);
- if (ret != 0) return -1;
-#else
- if (fclose(fp->file) != 0) return -1;
-#endif
- }
- free(fp->uncompressed_block);
- free(fp->compressed_block);
+ }
+ ret = fp->open_mode == 'w'? fclose(fp->fp) : _bgzf_close(fp->fp);
+ if (ret != 0) return -1;
+ free(fp->uncompressed_block);
+ free(fp->compressed_block);
free_cache(fp);
- free(fp);
- return 0;
+ free(fp);
+ return 0;
}
void bgzf_set_cache_size(BGZF *fp, int cache_size)
static uint8_t magic[28] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0";
uint8_t buf[28];
off_t offset;
-#ifdef _USE_KNETFILE
- offset = knet_tell(fp->x.fpr);
- if (knet_seek(fp->x.fpr, -28, SEEK_END) != 0) return -1;
- knet_read(fp->x.fpr, buf, 28);
- knet_seek(fp->x.fpr, offset, SEEK_SET);
-#else
- offset = ftello(fp->file);
- if (fseeko(fp->file, -28, SEEK_END) != 0) return -1;
- fread(buf, 1, 28, fp->file);
- fseeko(fp->file, offset, SEEK_SET);
-#endif
+ offset = _bgzf_tell((_bgzf_file_t)fp->fp);
+ if (_bgzf_seek(fp->fp, -28, SEEK_END) < 0) return 0;
+ _bgzf_read(fp->fp, buf, 28);
+ _bgzf_seek(fp->fp, offset, SEEK_SET);
return (memcmp(magic, buf, 28) == 0)? 1 : 0;
}
int block_offset;
int64_t block_address;
- if (fp->open_mode != 'r') {
- report_error(fp, "file not open for read");
- return -1;
- }
- if (where != SEEK_SET) {
- report_error(fp, "unimplemented seek option");
- return -1;
+ if (fp->open_mode != 'r' || where != SEEK_SET) {
+ fp->errcode |= BGZF_ERR_MISUSE;
+ return -1;
+ }
+ block_offset = pos & 0xFFFF;
+ block_address = pos >> 16;
+ if (_bgzf_seek(fp->fp, block_address, SEEK_SET) < 0) {
+ fp->errcode |= BGZF_ERR_IO;
+ return -1;
+ }
+ fp->block_length = 0; // indicates current block has not been loaded
+ fp->block_address = block_address;
+ fp->block_offset = block_offset;
+ return 0;
+}
+
+int bgzf_is_bgzf(const char *fn)
+{
+ uint8_t buf[16];
+ int n;
+ _bgzf_file_t fp;
+ if ((fp = _bgzf_open(fn, "r")) == 0) return 0;
+ n = _bgzf_read(fp, buf, 16);
+ _bgzf_close(fp);
+ if (n != 16) return 0;
+ return memcmp(g_magic, buf, 16) == 0? 1 : 0;
+}
+
+int bgzf_getc(BGZF *fp)
+{
+ int c;
+ if (fp->block_offset >= fp->block_length) {
+ if (bgzf_read_block(fp) != 0) return -2; /* error */
+ if (fp->block_length == 0) return -1; /* end-of-file */
+ }
+ c = ((unsigned char*)fp->uncompressed_block)[fp->block_offset++];
+ if (fp->block_offset == fp->block_length) {
+ fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
+ fp->block_offset = 0;
+ fp->block_length = 0;
}
- block_offset = pos & 0xFFFF;
- block_address = (pos >> 16) & 0xFFFFFFFFFFFFLL;
-#ifdef _USE_KNETFILE
- if (knet_seek(fp->x.fpr, block_address, SEEK_SET) != 0) {
-#else
- if (fseeko(fp->file, block_address, SEEK_SET) != 0) {
+ return c;
+}
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif
- report_error(fp, "seek failed");
- return -1;
- }
- fp->block_length = 0; // indicates current block is not loaded
- fp->block_address = block_address;
- fp->block_offset = block_offset;
- return 0;
+
+int bgzf_getline(BGZF *fp, int delim, kstring_t *str)
+{
+ int l, state = 0;
+ unsigned char *buf = (unsigned char*)fp->uncompressed_block;
+ str->l = 0;
+ do {
+ if (fp->block_offset >= fp->block_length) {
+ if (bgzf_read_block(fp) != 0) { state = -2; break; }
+ if (fp->block_length == 0) { state = -1; break; }
+ }
+ for (l = fp->block_offset; l < fp->block_length && buf[l] != delim; ++l);
+ if (l < fp->block_length) state = 1;
+ l -= fp->block_offset;
+ if (str->l + l + 1 >= str->m) {
+ str->m = str->l + l + 2;
+ kroundup32(str->m);
+ str->s = (char*)realloc(str->s, str->m);
+ }
+ memcpy(str->s + str->l, buf + fp->block_offset, l);
+ str->l += l;
+ fp->block_offset += l + 1;
+ if (fp->block_offset >= fp->block_length) {
+ fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
+ fp->block_offset = 0;
+ fp->block_length = 0;
+ }
+ } while (state == 0);
+ if (str->l == 0 && state < 0) return state;
+ str->s[str->l] = 0;
+ return str->l;
}
/* The MIT License
Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
+ 2011 Attractive Chaos <attractor@live.co.uk>
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
THE SOFTWARE.
*/
+/* The BGZF library was originally written by Bob Handsaker from the Broad
+ * Institute. It was later improved by the SAMtools developers. */
+
#ifndef __BGZF_H
#define __BGZF_H
#include <stdint.h>
#include <stdio.h>
#include <zlib.h>
-#ifdef _USE_KNETFILE
-#include "knetfile.h"
-#endif
-//typedef int8_t bool;
+#define BGZF_BLOCK_SIZE 0x10000 // 64k
+
+#define BGZF_ERR_ZLIB 1
+#define BGZF_ERR_HEADER 2
+#define BGZF_ERR_IO 4
+#define BGZF_ERR_MISUSE 8
typedef struct {
- int file_descriptor;
- char open_mode; // 'r' or 'w'
- int16_t owned_file, compress_level;
-#ifdef _USE_KNETFILE
- union {
- knetFile *fpr;
- FILE *fpw;
- } x;
-#else
- FILE* file;
-#endif
- int uncompressed_block_size;
- int compressed_block_size;
- void* uncompressed_block;
- void* compressed_block;
- int64_t block_address;
- int block_length;
- int block_offset;
+ int open_mode:8, compress_level:8, errcode:16;
int cache_size;
- const char* error;
+ int block_length, block_offset;
+ int64_t block_address;
+ void *uncompressed_block, *compressed_block;
void *cache; // a pointer to a hash table
+ void *fp; // actual file handler; FILE* on writing; FILE* or knetFile* on reading
} BGZF;
+#ifndef KSTRING_T
+#define KSTRING_T kstring_t
+typedef struct __kstring_t {
+ size_t l, m;
+ char *s;
+} kstring_t;
+#endif
+
#ifdef __cplusplus
extern "C" {
#endif
-/*
- * Open an existing file descriptor for reading or writing.
- * Mode must be either "r" or "w".
- * A subsequent bgzf_close will not close the file descriptor.
- * Returns null on error.
- */
-BGZF* bgzf_fdopen(int fd, const char* __restrict mode);
-
-/*
- * Open the specified file for reading or writing.
- * Mode must be either "r" or "w".
- * Returns null on error.
- */
-BGZF* bgzf_open(const char* path, const char* __restrict mode);
-
-/*
- * Close the BGZ file and free all associated resources.
- * Does not close the underlying file descriptor if created with bgzf_fdopen.
- * Returns zero on success, -1 on error.
- */
-int bgzf_close(BGZF* fp);
-
-/*
- * Read up to length bytes from the file storing into data.
- * Returns the number of bytes actually read.
- * Returns zero on end of file.
- * Returns -1 on error.
- */
-int bgzf_read(BGZF* fp, void* data, int length);
-
-/*
- * Write length bytes from data to the file.
- * Returns the number of bytes written.
- * Returns -1 on error.
- */
-int bgzf_write(BGZF* fp, const void* data, int length);
-
-/*
- * Return a virtual file pointer to the current location in the file.
- * No interpetation of the value should be made, other than a subsequent
- * call to bgzf_seek can be used to position the file at the same point.
- * Return value is non-negative on success.
- * Returns -1 on error.
- */
-#define bgzf_tell(fp) ((fp->block_address << 16) | (fp->block_offset & 0xFFFF))
-
-/*
- * Set the file to read from the location specified by pos, which must
- * be a value previously returned by bgzf_tell for this file (but not
- * necessarily one returned by this file handle).
- * The where argument must be SEEK_SET.
- * Seeking on a file opened for write is not supported.
- * Returns zero on success, -1 on error.
- */
-int64_t bgzf_seek(BGZF* fp, int64_t pos, int where);
-
-/*
- * Set the cache size. Zero to disable. By default, caching is
- * disabled. The recommended cache size for frequent random access is
- * about 8M bytes.
- */
-void bgzf_set_cache_size(BGZF *fp, int cache_size);
-
-int bgzf_check_EOF(BGZF *fp);
-int bgzf_read_block(BGZF* fp);
-int bgzf_flush(BGZF* fp);
-int bgzf_flush_try(BGZF *fp, int size);
-int bgzf_check_bgzf(const char *fn);
+ /******************
+ * Basic routines *
+ ******************/
+
+ /**
+ * Open an existing file descriptor for reading or writing.
+ *
+ * @param fd file descriptor
+ * @param mode mode matching /[rwu0-9]+/: 'r' for reading, 'w' for writing and a digit specifies
+ * the zlib compression level; if both 'r' and 'w' are present, 'w' is ignored.
+ * @return BGZF file handler; 0 on error
+ */
+ BGZF* bgzf_dopen(int fd, const char *mode);
+
+ /**
+ * Open the specified file for reading or writing.
+ */
+ BGZF* bgzf_open(const char* path, const char *mode);
+
+ /**
+ * Close the BGZF and free all associated resources.
+ *
+ * @param fp BGZF file handler
+ * @return 0 on success and -1 on error
+ */
+ int bgzf_close(BGZF *fp);
+
+ /**
+ * Read up to _length_ bytes from the file storing into _data_.
+ *
+ * @param fp BGZF file handler
+ * @param data data array to read into
+ * @param length size of data to read
+ * @return number of bytes actually read; 0 on end-of-file and -1 on error
+ */
+ ssize_t bgzf_read(BGZF *fp, void *data, ssize_t length);
+
+ /**
+ * Write _length_ bytes from _data_ to the file.
+ *
+ * @param fp BGZF file handler
+ * @param data data array to write
+ * @param length size of data to write
+ * @return number of bytes actually written; -1 on error
+ */
+ ssize_t bgzf_write(BGZF *fp, const void *data, ssize_t length);
+
+ /**
+ * Write the data in the buffer to the file.
+ */
+ int bgzf_flush(BGZF *fp);
+
+ /**
+ * Return a virtual file pointer to the current location in the file.
+ * No interpetation of the value should be made, other than a subsequent
+ * call to bgzf_seek can be used to position the file at the same point.
+ * Return value is non-negative on success.
+ */
+ #define bgzf_tell(fp) ((fp->block_address << 16) | (fp->block_offset & 0xFFFF))
+
+ /**
+ * Set the file to read from the location specified by _pos_.
+ *
+ * @param fp BGZF file handler
+ * @param pos virtual file offset returned by bgzf_tell()
+ * @param whence must be SEEK_SET
+ * @return 0 on success and -1 on error
+ */
+ int64_t bgzf_seek(BGZF *fp, int64_t pos, int whence);
+
+ /**
+ * Check if the BGZF end-of-file (EOF) marker is present
+ *
+ * @param fp BGZF file handler opened for reading
+ * @return 1 if EOF is present; 0 if not or on I/O error
+ */
+ int bgzf_check_EOF(BGZF *fp);
+
+ /**
+ * Check if a file is in the BGZF format
+ *
+ * @param fn file name
+ * @return 1 if _fn_ is BGZF; 0 if not or on I/O error
+ */
+ int bgzf_is_bgzf(const char *fn);
+
+ /*********************
+ * Advanced routines *
+ *********************/
+
+ /**
+ * Set the cache size. Only effective when compiled with -DBGZF_CACHE.
+ *
+ * @param fp BGZF file handler
+ * @param size size of cache in bytes; 0 to disable caching (default)
+ */
+ void bgzf_set_cache_size(BGZF *fp, int size);
+
+ /**
+ * Flush the file if the remaining buffer size is smaller than _size_
+ */
+ int bgzf_flush_try(BGZF *fp, ssize_t size);
+
+ /**
+ * Read one byte from a BGZF file. It is faster than bgzf_read()
+ * @param fp BGZF file handler
+ * @return byte read; -1 on end-of-file or error
+ */
+ int bgzf_getc(BGZF *fp);
+
+ /**
+ * Read one line from a BGZF file. It is faster than bgzf_getc()
+ *
+ * @param fp BGZF file handler
+ * @param delim delimitor
+ * @param str string to write to; must be initialized
+ * @return length of the string; 0 on end-of-file; negative on error
+ */
+ int bgzf_getline(BGZF *fp, int delim, kstring_t *str);
+
+ /**
+ * Read the next BGZF block.
+ */
+ int bgzf_read_block(BGZF *fp);
#ifdef __cplusplus
}
#endif
-static inline int bgzf_getc(BGZF *fp)
-{
- int c;
- if (fp->block_offset >= fp->block_length) {
- if (bgzf_read_block(fp) != 0) return -2; /* error */
- if (fp->block_length == 0) return -1; /* end-of-file */
- }
- c = ((unsigned char*)fp->uncompressed_block)[fp->block_offset++];
- if (fp->block_offset == fp->block_length) {
-#ifdef _USE_KNETFILE
- fp->block_address = knet_tell(fp->x.fpr);
-#else
- fp->block_address = ftello(fp->file);
-#endif
- fp->block_offset = 0;
- fp->block_length = 0;
- }
- return c;
-}
-
#endif
#include <errno.h>
#include "bgzf.h"
#include "tabix.h"
+#include "knetfile.h"
-#define PACKAGE_VERSION "0.2.5 (r964)"
+#define PACKAGE_VERSION "0.2.5 (r1005)"
#define error(...) { fprintf(stderr,__VA_ARGS__); return -1; }
error("%s: %s", header,strerror(errno));
int page_size = getpagesize();
char *buf = valloc(page_size);
- BGZF *bgzf_out = bgzf_fdopen(fileno(stdout), "w");
+ BGZF *bgzf_out = bgzf_dopen(fileno(stdout), "w");
ssize_t nread;
while ( (nread=fread(buf,1,page_size-1,fh))>0 )
{
if ( nread<page_size-1 && buf[nread-1]!='\n' )
buf[nread++] = '\n';
- if (bgzf_write(bgzf_out, buf, nread) < 0) error("Error: %s\n",bgzf_out->error);
+ if (bgzf_write(bgzf_out, buf, nread) < 0) error("Error: %d\n",bgzf_out->errcode);
}
fclose(fh);
if ( fp->block_length - skip_until > 0 )
{
if (bgzf_write(bgzf_out, buffer+skip_until, fp->block_length-skip_until) < 0)
- error("Error: %s\n",fp->error);
+ error("Error: %d\n",fp->errcode);
}
if (bgzf_flush(bgzf_out) < 0)
- error("Error: %s\n",bgzf_out->error);
+ error("Error: %d\n",bgzf_out->errcode);
while (1)
{
#ifdef _USE_KNETFILE
- nread = knet_read(fp->x.fpr, buf, page_size);
+ nread = knet_read(fp->fp, buf, page_size);
#else
- nread = fread(buf, 1, page_size, fp->file);
+ nread = fread(buf, 1, page_size, fp->fp);
#endif
if ( nread<=0 )
break;
-#ifdef _USE_KNETFILE
- int count = fwrite(buf, 1, nread, bgzf_out->x.fpw);
-#else
- int count = fwrite(buf, 1, nread, bgzf_out->file);
-#endif
+ int count = fwrite(buf, 1, nread, bgzf_out->fp);
if (count != nread)
error("Write failed, wrote %d instead of %d bytes.\n", count,(int)nread);
}
if (bgzf_close(bgzf_out) < 0)
- error("Error: %s\n",bgzf_out->error);
+ error("Error: %d\n",bgzf_out->errcode);
return 0;
}
int main(int argc, char *argv[])
{
- int c, skip = -1, meta = -1, list_chrms = 0, force = 0, print_header = 0, bed_reg = 0;
- ti_conf_t conf = ti_conf_gff;
+ int c, skip = -1, meta = -1, list_chrms = 0, force = 0, print_header = 0, print_only_header = 0, bed_reg = 0;
+ ti_conf_t conf = ti_conf_gff, *conf_ptr = NULL;
const char *reheader = NULL;
- while ((c = getopt(argc, argv, "p:s:b:e:0S:c:lhfBr:")) >= 0) {
+ while ((c = getopt(argc, argv, "p:s:b:e:0S:c:lhHfBr:")) >= 0) {
switch (c) {
case 'B': bed_reg = 1; break;
case '0': conf.preset |= TI_FLAG_UCSC; break;
case 'S': skip = atoi(optarg); break;
case 'c': meta = optarg[0]; break;
case 'p':
- if (strcmp(optarg, "gff") == 0) conf = ti_conf_gff;
- else if (strcmp(optarg, "bed") == 0) conf = ti_conf_bed;
- else if (strcmp(optarg, "sam") == 0) conf = ti_conf_sam;
- else if (strcmp(optarg, "vcf") == 0 || strcmp(optarg, "vcf4") == 0) conf = ti_conf_vcf;
- else if (strcmp(optarg, "psltbl") == 0) conf = ti_conf_psltbl;
+ if (strcmp(optarg, "gff") == 0) conf_ptr = &ti_conf_gff;
+ else if (strcmp(optarg, "bed") == 0) conf_ptr = &ti_conf_bed;
+ else if (strcmp(optarg, "sam") == 0) conf_ptr = &ti_conf_sam;
+ else if (strcmp(optarg, "vcf") == 0 || strcmp(optarg, "vcf4") == 0) conf_ptr = &ti_conf_vcf;
+ else if (strcmp(optarg, "psltbl") == 0) conf_ptr = &ti_conf_psltbl;
else {
fprintf(stderr, "[main] unrecognized preset '%s'\n", optarg);
return 1;
case 'e': conf.ec = atoi(optarg); break;
case 'l': list_chrms = 1; break;
case 'h': print_header = 1; break;
+ case 'H': print_only_header = 1; break;
case 'f': force = 1; break;
case 'r': reheader = optarg; break;
}
}
- if (skip >= 0) conf.line_skip = skip;
- if (meta >= 0) conf.meta_char = meta;
if (optind == argc) {
fprintf(stderr, "\n");
fprintf(stderr, "Program: tabix (TAB-delimited file InderXer)\n");
fprintf(stderr, " -r FILE replace the header with the content of FILE [null]\n");
fprintf(stderr, " -B region1 is a BED file (entire file will be read)\n");
fprintf(stderr, " -0 zero-based coordinate\n");
- fprintf(stderr, " -h print the header lines\n");
+ fprintf(stderr, " -h print also the header lines\n");
+ fprintf(stderr, " -H print only the header lines\n");
fprintf(stderr, " -l list chromosome names\n");
fprintf(stderr, " -f force to overwrite the index\n");
fprintf(stderr, "\n");
return 1;
}
+ if ( !conf_ptr )
+ {
+ int l = strlen(argv[optind]);
+ int strcasecmp(const char *s1, const char *s2);
+ if (l>=7 && strcasecmp(argv[optind]+l-7, ".gff.gz") == 0) conf_ptr = &ti_conf_gff;
+ else if (l>=7 && strcasecmp(argv[optind]+l-7, ".bed.gz") == 0) conf_ptr = &ti_conf_bed;
+ else if (l>=7 && strcasecmp(argv[optind]+l-7, ".sam.gz") == 0) conf_ptr = &ti_conf_sam;
+ else if (l>=7 && strcasecmp(argv[optind]+l-7, ".vcf.gz") == 0) conf_ptr = &ti_conf_vcf;
+ else if (l>=10 && strcasecmp(argv[optind]+l-10, ".psltbl.gz") == 0) conf_ptr = &ti_conf_psltbl;
+ }
+ if ( conf_ptr )
+ conf = *conf_ptr;
+
+ if (skip >= 0) conf.line_skip = skip;
+ if (meta >= 0) conf.meta_char = meta;
if (list_chrms) {
ti_index_t *idx;
int i, n;
char *fnidx = calloc(strlen(argv[optind]) + 5, 1);
strcat(strcpy(fnidx, argv[optind]), ".tbi");
- if (optind + 1 == argc) {
+ if (optind + 1 == argc && !print_only_header) {
if (force == 0) {
if (stat(fnidx, &stat_tbi) == 0)
{
}
}
}
- if ( bgzf_check_bgzf(argv[optind])!=1 )
+ if ( bgzf_is_bgzf(argv[optind])!=1 )
{
fprintf(stderr,"[tabix] was bgzip used to compress this file? %s\n", argv[optind]);
free(fnidx);
return 1;
+ }
+ if ( !conf_ptr )
+ {
+ // Building the index but the file type was neither recognised nor given. If no custom change
+ // has been made, warn the user that GFF is used
+ if ( conf.preset==ti_conf_gff.preset
+ && conf.sc==ti_conf_gff.sc
+ && conf.bc==ti_conf_gff.bc
+ && conf.ec==ti_conf_gff.ec
+ && conf.meta_char==ti_conf_gff.meta_char
+ && conf.line_skip==ti_conf_gff.line_skip )
+ fprintf(stderr,"[tabix] The file type not recognised and -p not given, using the preset [gff].\n");
}
return ti_index_build(argv[optind], &conf);
}
{ // retrieve
tabix_t *t;
- // Common source of errors: new VCF is used with an old index
- stat(fnidx, &stat_tbi);
- stat(argv[optind], &stat_vcf);
- if ( force==0 && stat_vcf.st_mtime > stat_tbi.st_mtime )
+ // On some systems, stat on non-existent files returns undefined value for sm_mtime, the user had to use -f
+ int is_remote = (strstr(fnidx, "ftp://") == fnidx || strstr(fnidx, "http://") == fnidx) ? 1 : 0;
+ if ( !is_remote )
{
- fprintf(stderr, "[tabix] the index file is older than the vcf file. Please use '-f' to overwrite or reindex.\n");
- free(fnidx);
- return 1;
+ // Common source of errors: new VCF is used with an old index
+ stat(fnidx, &stat_tbi);
+ stat(argv[optind], &stat_vcf);
+ if ( force==0 && stat_vcf.st_mtime > stat_tbi.st_mtime )
+ {
+ fprintf(stderr, "[tabix] the index file either does not exist or is older than the vcf file. Please reindex.\n");
+ free(fnidx);
+ return 1;
+ }
}
free(fnidx);
fprintf(stderr, "[main] fail to open the data file.\n");
return 1;
}
+ if ( print_only_header )
+ {
+ ti_iter_t iter;
+ const char *s;
+ int len;
+ if (ti_lazy_index_load(t) < 0 && bed_reg == 0) {
+ fprintf(stderr,"[tabix] failed to load the index file.\n");
+ return 1;
+ }
+ const ti_conf_t *idxconf = ti_get_conf(t->idx);
+ iter = ti_query(t, 0, 0, 0);
+ while ((s = ti_read(t, iter, &len)) != 0) {
+ if ((int)(*s) != idxconf->meta_char) break;
+ fputs(s, stdout); fputc('\n', stdout);
+ }
+ ti_iter_destroy(iter);
+ return 0;
+ }
+
if (strcmp(argv[optind+1], ".") == 0) { // retrieve all
ti_iter_t iter;
const char *s;