From: Charles Plessy Date: Thu, 19 Aug 2010 06:49:09 +0000 (+0900) Subject: Imported Upstream version 0.1.0 X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=tabix.git;a=commitdiff_plain;h=bc08e6c2f34c1367d0fe75dd19cedccfab881229 Imported Upstream version 0.1.0 --- bc08e6c2f34c1367d0fe75dd19cedccfab881229 diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..3d7153d --- /dev/null +++ b/Makefile @@ -0,0 +1,50 @@ +CC= gcc +CFLAGS= -g -Wall -O2 #-m64 #-arch ppc +DFLAGS= -D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE +LOBJS= bgzf.o kstring.o knetfile.o index.o +AOBJS= main.o +PROG= tabix bgzip +INCLUDES= +SUBDIRS= . +LIBPATH= +LIBCURSES= + +.SUFFIXES:.c .o + +.c.o: + $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@ + +all-recur lib-recur clean-recur cleanlocal-recur install-recur: + @target=`echo $@ | sed s/-recur//`; \ + wdir=`pwd`; \ + list='$(SUBDIRS)'; for subdir in $$list; do \ + cd $$subdir; \ + $(MAKE) CC="$(CC)" DFLAGS="$(DFLAGS)" CFLAGS="$(CFLAGS)" \ + INCLUDES="$(INCLUDES)" LIBPATH="$(LIBPATH)" $$target || exit 1; \ + cd $$wdir; \ + done; + +all:$(PROG) + +lib:libtabix.a + +libtabix.a:$(LOBJS) + $(AR) -cru $@ $(LOBJS) + +tabix:lib $(AOBJS) + $(CC) $(CFLAGS) -o $@ $(AOBJS) -lm $(LIBPATH) -lz -L. -ltabix + +bgzip:bgzip.o bgzf.o knetfile.o + $(CC) $(CFLAGS) -o $@ bgzip.o bgzf.o knetfile.o -lz + +kstring.o:kstring.h +knetfile.o:knetfile.h +bgzf.o:bgzf.h knetfile.h +index.o:bgzf.h tabix.h khash.h ksort.h kstring.h +main.o:tabix.h kstring.h bgzf.h +bgzip.o:bgzf.h + +cleanlocal: + rm -fr gmon.out *.o a.out *.dSYM $(PROG) *~ *.a + +clean:cleanlocal-recur diff --git a/bam_endian.h b/bam_endian.h new file mode 100644 index 0000000..0fc74a8 --- /dev/null +++ b/bam_endian.h @@ -0,0 +1,42 @@ +#ifndef BAM_ENDIAN_H +#define BAM_ENDIAN_H + +#include + +static inline int bam_is_big_endian() +{ + long one= 1; + return !(*((char *)(&one))); +} +static inline uint16_t bam_swap_endian_2(uint16_t v) +{ + return (uint16_t)(((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8)); +} +static inline void *bam_swap_endian_2p(void *x) +{ + *(uint16_t*)x = bam_swap_endian_2(*(uint16_t*)x); + return x; +} +static inline uint32_t bam_swap_endian_4(uint32_t v) +{ + v = ((v & 0x0000FFFFU) << 16) | (v >> 16); + return ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8); +} +static inline void *bam_swap_endian_4p(void *x) +{ + *(uint32_t*)x = bam_swap_endian_4(*(uint32_t*)x); + return x; +} +static inline uint64_t bam_swap_endian_8(uint64_t v) +{ + v = ((v & 0x00000000FFFFFFFFLLU) << 32) | (v >> 32); + v = ((v & 0x0000FFFF0000FFFFLLU) << 16) | ((v & 0xFFFF0000FFFF0000LLU) >> 16); + return ((v & 0x00FF00FF00FF00FFLLU) << 8) | ((v & 0xFF00FF00FF00FF00LLU) >> 8); +} +static inline void *bam_swap_endian_8p(void *x) +{ + *(uint64_t*)x = bam_swap_endian_8(*(uint64_t*)x); + return x; +} + +#endif diff --git a/bgzf.c b/bgzf.c new file mode 100644 index 0000000..7a936a8 --- /dev/null +++ b/bgzf.c @@ -0,0 +1,676 @@ +/* The MIT License + + Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +/* + 2009-06-29 by lh3: cache recent uncompressed blocks. + 2009-06-25 by lh3: optionally use my knetfile library to access file on a FTP. + 2009-06-12 by lh3: support a mode string like "wu" where 'u' for uncompressed output */ + +#include +#include +#include +#include +#include +#include +#include +#include "bgzf.h" + +#include "khash.h" +typedef struct { + int size; + uint8_t *block; + int64_t end_offset; +} cache_t; +KHASH_MAP_INIT_INT64(cache, cache_t) + +#if defined(_WIN32) || defined(_MSC_VER) +#define ftello(fp) ftell(fp) +#define fseeko(fp, offset, whence) fseek(fp, offset, whence) +#else +extern off_t ftello(FILE *stream); +extern int fseeko(FILE *stream, off_t offset, int whence); +#endif + +typedef int8_t bgzf_byte_t; + +static const int DEFAULT_BLOCK_SIZE = 64 * 1024; +static const int MAX_BLOCK_SIZE = 64 * 1024; + +static const int BLOCK_HEADER_LENGTH = 18; +static const int BLOCK_FOOTER_LENGTH = 8; + +static const int GZIP_ID1 = 31; +static const int GZIP_ID2 = 139; +static const int CM_DEFLATE = 8; +static const int FLG_FEXTRA = 4; +static const int OS_UNKNOWN = 255; +static const int BGZF_ID1 = 66; // 'B' +static const int BGZF_ID2 = 67; // 'C' +static const int BGZF_LEN = 2; +static const int BGZF_XLEN = 6; // BGZF_LEN+4 + +static const int GZIP_WINDOW_BITS = -15; // no zlib header +static const int Z_DEFAULT_MEM_LEVEL = 8; + + +inline +void +packInt16(uint8_t* buffer, uint16_t value) +{ + buffer[0] = value; + buffer[1] = value >> 8; +} + +inline +int +unpackInt16(const uint8_t* buffer) +{ + return (buffer[0] | (buffer[1] << 8)); +} + +inline +void +packInt32(uint8_t* buffer, uint32_t value) +{ + buffer[0] = value; + buffer[1] = value >> 8; + buffer[2] = value >> 16; + buffer[3] = value >> 24; +} + +static inline +int +bgzf_min(int x, int y) +{ + return (x < y) ? x : y; +} + +static +void +report_error(BGZF* fp, const char* message) { + fp->error = message; +} + +static BGZF *bgzf_read_init() +{ + BGZF *fp; + fp = calloc(1, sizeof(BGZF)); + fp->uncompressed_block_size = MAX_BLOCK_SIZE; + fp->uncompressed_block = malloc(MAX_BLOCK_SIZE); + fp->compressed_block_size = MAX_BLOCK_SIZE; + fp->compressed_block = malloc(MAX_BLOCK_SIZE); + fp->cache_size = 0; + fp->cache = kh_init(cache); + return fp; +} + +static +BGZF* +open_read(int fd) +{ +#ifdef _USE_KNETFILE + knetFile *file = knet_dopen(fd, "r"); +#else + FILE* file = fdopen(fd, "r"); +#endif + BGZF* fp; + if (file == 0) return 0; + fp = bgzf_read_init(); + fp->file_descriptor = fd; + fp->open_mode = 'r'; +#ifdef _USE_KNETFILE + fp->x.fpr = file; +#else + fp->file = file; +#endif + return fp; +} + +static +BGZF* +open_write(int fd, bool is_uncompressed) +{ + FILE* file = fdopen(fd, "w"); + BGZF* fp; + if (file == 0) return 0; + fp = malloc(sizeof(BGZF)); + fp->file_descriptor = fd; + fp->open_mode = 'w'; + fp->owned_file = 0; fp->is_uncompressed = is_uncompressed; +#ifdef _USE_KNETFILE + fp->x.fpw = file; +#else + fp->file = file; +#endif + fp->uncompressed_block_size = DEFAULT_BLOCK_SIZE; + fp->uncompressed_block = NULL; + fp->compressed_block_size = MAX_BLOCK_SIZE; + fp->compressed_block = malloc(MAX_BLOCK_SIZE); + fp->block_address = 0; + fp->block_offset = 0; + fp->block_length = 0; + fp->error = NULL; + return fp; +} + +BGZF* +bgzf_open(const char* __restrict path, const char* __restrict mode) +{ + BGZF* fp = NULL; + if (mode[0] == 'r' || mode[0] == 'R') { /* The reading mode is preferred. */ +#ifdef _USE_KNETFILE + knetFile *file = knet_open(path, mode); + if (file == 0) return 0; + fp = bgzf_read_init(); + fp->file_descriptor = -1; + fp->open_mode = 'r'; + fp->x.fpr = file; +#else + int fd, oflag = O_RDONLY; +#ifdef _WIN32 + oflag |= O_BINARY; +#endif + fd = open(path, oflag); + if (fd == -1) return 0; + fp = open_read(fd); +#endif + } else if (mode[0] == 'w' || mode[0] == 'W') { + int fd, oflag = O_WRONLY | O_CREAT | O_TRUNC; +#ifdef _WIN32 + oflag |= O_BINARY; +#endif + fd = open(path, oflag, 0666); + if (fd == -1) return 0; + fp = open_write(fd, strstr(mode, "u")? 1 : 0); + } + if (fp != NULL) { + fp->owned_file = 1; + } + return fp; +} + +BGZF* +bgzf_fdopen(int fd, const char * __restrict mode) +{ + if (fd == -1) return 0; + if (mode[0] == 'r' || mode[0] == 'R') { + return open_read(fd); + } else if (mode[0] == 'w' || mode[0] == 'W') { + return open_write(fd, strstr(mode, "u")? 1 : 0); + } else { + return NULL; + } +} + +static +int +deflate_block(BGZF* fp, int block_length) +{ + // Deflate the block in fp->uncompressed_block into fp->compressed_block. + // Also adds an extra field that stores the compressed block length. + + bgzf_byte_t* buffer = fp->compressed_block; + int buffer_size = fp->compressed_block_size; + + // Init gzip header + buffer[0] = GZIP_ID1; + buffer[1] = GZIP_ID2; + buffer[2] = CM_DEFLATE; + buffer[3] = FLG_FEXTRA; + buffer[4] = 0; // mtime + buffer[5] = 0; + buffer[6] = 0; + buffer[7] = 0; + buffer[8] = 0; + buffer[9] = OS_UNKNOWN; + buffer[10] = BGZF_XLEN; + buffer[11] = 0; + buffer[12] = BGZF_ID1; + buffer[13] = BGZF_ID2; + buffer[14] = BGZF_LEN; + buffer[15] = 0; + buffer[16] = 0; // placeholder for block length + buffer[17] = 0; + + // loop to retry for blocks that do not compress enough + int input_length = block_length; + int compressed_length = 0; + while (1) { + int compress_level = fp->is_uncompressed? 0 : Z_DEFAULT_COMPRESSION; + z_stream zs; + zs.zalloc = NULL; + zs.zfree = NULL; + zs.next_in = fp->uncompressed_block; + zs.avail_in = input_length; + zs.next_out = (void*)&buffer[BLOCK_HEADER_LENGTH]; + zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH; + + int status = deflateInit2(&zs, compress_level, Z_DEFLATED, + GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY); + if (status != Z_OK) { + report_error(fp, "deflate init failed"); + return -1; + } + status = deflate(&zs, Z_FINISH); + if (status != Z_STREAM_END) { + deflateEnd(&zs); + if (status == Z_OK) { + // Not enough space in buffer. + // Can happen in the rare case the input doesn't compress enough. + // Reduce the amount of input until it fits. + input_length -= 1024; + if (input_length <= 0) { + // should never happen + report_error(fp, "input reduction failed"); + return -1; + } + continue; + } + report_error(fp, "deflate failed"); + return -1; + } + status = deflateEnd(&zs); + if (status != Z_OK) { + report_error(fp, "deflate end failed"); + return -1; + } + compressed_length = zs.total_out; + compressed_length += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH; + if (compressed_length > MAX_BLOCK_SIZE) { + // should never happen + report_error(fp, "deflate overflow"); + return -1; + } + break; + } + + packInt16((uint8_t*)&buffer[16], compressed_length-1); + uint32_t crc = crc32(0L, NULL, 0L); + crc = crc32(crc, fp->uncompressed_block, input_length); + packInt32((uint8_t*)&buffer[compressed_length-8], crc); + packInt32((uint8_t*)&buffer[compressed_length-4], input_length); + + int remaining = block_length - input_length; + if (remaining > 0) { + if (remaining > input_length) { + // should never happen (check so we can use memcpy) + report_error(fp, "remainder too large"); + return -1; + } + memcpy(fp->uncompressed_block, + fp->uncompressed_block + input_length, + remaining); + } + fp->block_offset = remaining; + return compressed_length; +} + +static +int +inflate_block(BGZF* fp, int block_length) +{ + // Inflate the block in fp->compressed_block into fp->uncompressed_block + + z_stream zs; + zs.zalloc = NULL; + zs.zfree = NULL; + zs.next_in = fp->compressed_block + 18; + zs.avail_in = block_length - 16; + zs.next_out = fp->uncompressed_block; + zs.avail_out = fp->uncompressed_block_size; + + int status = inflateInit2(&zs, GZIP_WINDOW_BITS); + if (status != Z_OK) { + report_error(fp, "inflate init failed"); + return -1; + } + status = inflate(&zs, Z_FINISH); + if (status != Z_STREAM_END) { + inflateEnd(&zs); + report_error(fp, "inflate failed"); + return -1; + } + status = inflateEnd(&zs); + if (status != Z_OK) { + report_error(fp, "inflate failed"); + return -1; + } + return zs.total_out; +} + +static +int +check_header(const bgzf_byte_t* header) +{ + return (header[0] == GZIP_ID1 && + header[1] == (bgzf_byte_t) GZIP_ID2 && + header[2] == Z_DEFLATED && + (header[3] & FLG_FEXTRA) != 0 && + unpackInt16((uint8_t*)&header[10]) == BGZF_XLEN && + header[12] == BGZF_ID1 && + header[13] == BGZF_ID2 && + unpackInt16((uint8_t*)&header[14]) == BGZF_LEN); +} + +static void free_cache(BGZF *fp) +{ + khint_t k; + khash_t(cache) *h = (khash_t(cache)*)fp->cache; + if (fp->open_mode != 'r') return; + for (k = kh_begin(h); k < kh_end(h); ++k) + if (kh_exist(h, k)) free(kh_val(h, k).block); + kh_destroy(cache, h); +} + +static int load_block_from_cache(BGZF *fp, int64_t block_address) +{ + khint_t k; + cache_t *p; + khash_t(cache) *h = (khash_t(cache)*)fp->cache; + k = kh_get(cache, h, block_address); + if (k == kh_end(h)) return 0; + p = &kh_val(h, k); + if (fp->block_length != 0) fp->block_offset = 0; + fp->block_address = block_address; + fp->block_length = p->size; + memcpy(fp->uncompressed_block, p->block, MAX_BLOCK_SIZE); +#ifdef _USE_KNETFILE + knet_seek(fp->x.fpr, p->end_offset, SEEK_SET); +#else + fseeko(fp->file, p->end_offset, SEEK_SET); +#endif + return p->size; +} + +static void cache_block(BGZF *fp, int size) +{ + int ret; + khint_t k; + cache_t *p; + khash_t(cache) *h = (khash_t(cache)*)fp->cache; + if (MAX_BLOCK_SIZE >= fp->cache_size) return; + if ((kh_size(h) + 1) * MAX_BLOCK_SIZE > fp->cache_size) { + /* A better way would be to remove the oldest block in the + * cache, but here we remove a random one for simplicity. This + * should not have a big impact on performance. */ + for (k = kh_begin(h); k < kh_end(h); ++k) + if (kh_exist(h, k)) break; + if (k < kh_end(h)) { + free(kh_val(h, k).block); + kh_del(cache, h, k); + } + } + k = kh_put(cache, h, fp->block_address, &ret); + if (ret == 0) return; // if this happens, a bug! + p = &kh_val(h, k); + p->size = fp->block_length; + p->end_offset = fp->block_address + size; + p->block = malloc(MAX_BLOCK_SIZE); + memcpy(kh_val(h, k).block, fp->uncompressed_block, MAX_BLOCK_SIZE); +} + +int +bgzf_read_block(BGZF* fp) +{ + bgzf_byte_t header[BLOCK_HEADER_LENGTH]; + int size = 0; +#ifdef _USE_KNETFILE + int64_t block_address = knet_tell(fp->x.fpr); + if (load_block_from_cache(fp, block_address)) return 0; + int count = knet_read(fp->x.fpr, header, sizeof(header)); +#else + int64_t block_address = ftello(fp->file); + if (load_block_from_cache(fp, block_address)) return 0; + int count = fread(header, 1, sizeof(header), fp->file); +#endif + if (count == 0) { + fp->block_length = 0; + return 0; + } + size = count; + if (count != sizeof(header)) { + report_error(fp, "read failed"); + return -1; + } + if (!check_header(header)) { + report_error(fp, "invalid block header"); + return -1; + } + int block_length = unpackInt16((uint8_t*)&header[16]) + 1; + bgzf_byte_t* compressed_block = (bgzf_byte_t*) fp->compressed_block; + memcpy(compressed_block, header, BLOCK_HEADER_LENGTH); + int remaining = block_length - BLOCK_HEADER_LENGTH; +#ifdef _USE_KNETFILE + count = knet_read(fp->x.fpr, &compressed_block[BLOCK_HEADER_LENGTH], remaining); +#else + count = fread(&compressed_block[BLOCK_HEADER_LENGTH], 1, remaining, fp->file); +#endif + if (count != remaining) { + report_error(fp, "read failed"); + return -1; + } + size += count; + count = inflate_block(fp, block_length); + if (count < 0) { + return -1; + } + if (fp->block_length != 0) { + // Do not reset offset if this read follows a seek. + fp->block_offset = 0; + } + fp->block_address = block_address; + fp->block_length = count; + cache_block(fp, size); + return 0; +} + +int +bgzf_read(BGZF* fp, void* data, int length) +{ + if (length <= 0) { + return 0; + } + if (fp->open_mode != 'r') { + report_error(fp, "file not open for reading"); + return -1; + } + + int bytes_read = 0; + bgzf_byte_t* output = data; + while (bytes_read < length) { + int available = fp->block_length - fp->block_offset; + if (available <= 0) { + if (bgzf_read_block(fp) != 0) { + return -1; + } + available = fp->block_length - fp->block_offset; + if (available <= 0) { + break; + } + } + int copy_length = bgzf_min(length-bytes_read, available); + bgzf_byte_t* buffer = fp->uncompressed_block; + memcpy(output, buffer + fp->block_offset, copy_length); + fp->block_offset += copy_length; + output += copy_length; + bytes_read += copy_length; + } + if (fp->block_offset == fp->block_length) { +#ifdef _USE_KNETFILE + fp->block_address = knet_tell(fp->x.fpr); +#else + fp->block_address = ftello(fp->file); +#endif + fp->block_offset = 0; + fp->block_length = 0; + } + return bytes_read; +} + +static +int +flush_block(BGZF* fp) +{ + while (fp->block_offset > 0) { + int block_length = deflate_block(fp, fp->block_offset); + if (block_length < 0) { + return -1; + } +#ifdef _USE_KNETFILE + int count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw); +#else + int count = fwrite(fp->compressed_block, 1, block_length, fp->file); +#endif + if (count != block_length) { + report_error(fp, "write failed"); + return -1; + } + fp->block_address += block_length; + } + return 0; +} + +int +bgzf_write(BGZF* fp, const void* data, int length) +{ + if (fp->open_mode != 'w') { + report_error(fp, "file not open for writing"); + return -1; + } + + if (fp->uncompressed_block == NULL) { + fp->uncompressed_block = malloc(fp->uncompressed_block_size); + } + + const bgzf_byte_t* input = data; + int block_length = fp->uncompressed_block_size; + int bytes_written = 0; + while (bytes_written < length) { + int copy_length = bgzf_min(block_length - fp->block_offset, length - bytes_written); + bgzf_byte_t* buffer = fp->uncompressed_block; + memcpy(buffer + fp->block_offset, input, copy_length); + fp->block_offset += copy_length; + input += copy_length; + bytes_written += copy_length; + if (fp->block_offset == block_length) { + if (flush_block(fp) != 0) { + break; + } + } + } + return bytes_written; +} + +int +bgzf_close(BGZF* fp) +{ + if (fp->open_mode == 'w') { + if (flush_block(fp) != 0) { + return -1; + } + { // add an empty block + int count, block_length = deflate_block(fp, 0); +#ifdef _USE_KNETFILE + count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw); +#else + count = fwrite(fp->compressed_block, 1, block_length, fp->file); +#endif + } +#ifdef _USE_KNETFILE + if (fflush(fp->x.fpw) != 0) { +#else + if (fflush(fp->file) != 0) { +#endif + report_error(fp, "flush failed"); + return -1; + } + } + if (fp->owned_file) { +#ifdef _USE_KNETFILE + int ret; + if (fp->open_mode == 'w') ret = fclose(fp->x.fpw); + else ret = knet_close(fp->x.fpr); + if (ret != 0) return -1; +#else + if (fclose(fp->file) != 0) { + return -1; + } +#endif + } + free(fp->uncompressed_block); + free(fp->compressed_block); + free_cache(fp); + free(fp); + return 0; +} + +void bgzf_set_cache_size(BGZF *fp, int cache_size) +{ + if (fp) fp->cache_size = cache_size; +} + +int bgzf_check_EOF(BGZF *fp) +{ + static uint8_t magic[28] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0"; + uint8_t buf[28]; + off_t offset; +#ifdef _USE_KNETFILE + offset = knet_tell(fp->x.fpr); + if (knet_seek(fp->x.fpr, -28, SEEK_END) != 0) return -1; + knet_read(fp->x.fpr, buf, 28); + knet_seek(fp->x.fpr, offset, SEEK_SET); +#else + offset = ftello(fp->file); + if (fseeko(fp->file, -28, SEEK_END) != 0) return -1; + fread(buf, 1, 28, fp->file); + fseeko(fp->file, offset, SEEK_SET); +#endif + return (memcmp(magic, buf, 28) == 0)? 1 : 0; +} + +int64_t +bgzf_seek(BGZF* fp, int64_t pos, int where) +{ + if (fp->open_mode != 'r') { + report_error(fp, "file not open for read"); + return -1; + } + if (where != SEEK_SET) { + report_error(fp, "unimplemented seek option"); + return -1; + } + int block_offset = pos & 0xFFFF; + int64_t block_address = (pos >> 16) & 0xFFFFFFFFFFFFLL; +#ifdef _USE_KNETFILE + if (knet_seek(fp->x.fpr, block_address, SEEK_SET) != 0) { +#else + if (fseeko(fp->file, block_address, SEEK_SET) != 0) { +#endif + report_error(fp, "seek failed"); + return -1; + } + fp->block_length = 0; // indicates current block is not loaded + fp->block_address = block_address; + fp->block_offset = block_offset; + return 0; +} diff --git a/bgzf.h b/bgzf.h new file mode 100644 index 0000000..f544a67 --- /dev/null +++ b/bgzf.h @@ -0,0 +1,156 @@ +/* The MIT License + + Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#ifndef __BGZF_H +#define __BGZF_H + +#include +#include +#include +#include +#ifdef _USE_KNETFILE +#include "knetfile.h" +#endif + +//typedef int8_t bool; + +typedef struct { + int file_descriptor; + char open_mode; // 'r' or 'w' + bool owned_file, is_uncompressed; +#ifdef _USE_KNETFILE + union { + knetFile *fpr; + FILE *fpw; + } x; +#else + FILE* file; +#endif + int uncompressed_block_size; + int compressed_block_size; + void* uncompressed_block; + void* compressed_block; + int64_t block_address; + int block_length; + int block_offset; + int cache_size; + const char* error; + void *cache; // a pointer to a hash table +} BGZF; + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Open an existing file descriptor for reading or writing. + * Mode must be either "r" or "w". + * A subsequent bgzf_close will not close the file descriptor. + * Returns null on error. + */ +BGZF* bgzf_fdopen(int fd, const char* __restrict mode); + +/* + * Open the specified file for reading or writing. + * Mode must be either "r" or "w". + * Returns null on error. + */ +BGZF* bgzf_open(const char* path, const char* __restrict mode); + +/* + * Close the BGZ file and free all associated resources. + * Does not close the underlying file descriptor if created with bgzf_fdopen. + * Returns zero on success, -1 on error. + */ +int bgzf_close(BGZF* fp); + +/* + * Read up to length bytes from the file storing into data. + * Returns the number of bytes actually read. + * Returns zero on end of file. + * Returns -1 on error. + */ +int bgzf_read(BGZF* fp, void* data, int length); + +/* + * Write length bytes from data to the file. + * Returns the number of bytes written. + * Returns -1 on error. + */ +int bgzf_write(BGZF* fp, const void* data, int length); + +/* + * Return a virtual file pointer to the current location in the file. + * No interpetation of the value should be made, other than a subsequent + * call to bgzf_seek can be used to position the file at the same point. + * Return value is non-negative on success. + * Returns -1 on error. + */ +#define bgzf_tell(fp) ((fp->block_address << 16) | (fp->block_offset & 0xFFFF)) + +/* + * Set the file to read from the location specified by pos, which must + * be a value previously returned by bgzf_tell for this file (but not + * necessarily one returned by this file handle). + * The where argument must be SEEK_SET. + * Seeking on a file opened for write is not supported. + * Returns zero on success, -1 on error. + */ +int64_t bgzf_seek(BGZF* fp, int64_t pos, int where); + +/* + * Set the cache size. Zero to disable. By default, caching is + * disabled. The recommended cache size for frequent random access is + * about 8M bytes. + */ +void bgzf_set_cache_size(BGZF *fp, int cache_size); + +int bgzf_check_EOF(BGZF *fp); + +int bgzf_read_block(BGZF* fp); + +#ifdef __cplusplus +} +#endif + +static inline int bgzf_getc(BGZF *fp) +{ + int c; + if (fp->block_offset >= fp->block_length) { + if (bgzf_read_block(fp) != 0) return -2; /* error */ + if (fp->block_length == 0) return -1; /* end-of-file */ + } + c = ((unsigned char*)fp->uncompressed_block)[fp->block_offset++]; + if (fp->block_offset == fp->block_length) { +#ifdef _USE_KNETFILE + fp->block_address = knet_tell(fp->x.fpr); +#else + fp->block_address = ftello(fp->file); +#endif + fp->block_offset = 0; + fp->block_length = 0; + } + return c; +} + +#endif diff --git a/bgzip.c b/bgzip.c new file mode 100644 index 0000000..381ebc5 --- /dev/null +++ b/bgzip.c @@ -0,0 +1,179 @@ +/* The MIT License + + Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#include +#include +#include +#include +#include +#include +#include +#include "bgzf.h" + +static const int WINDOW_SIZE = 64 * 1024; + +static int is_ready(int fd) +{ + fd_set fdset; + struct timeval timeout; + FD_ZERO(&fdset); + FD_SET(fd, &fdset); + timeout.tv_sec = 0; timeout.tv_usec = 100000; + return select(1, &fdset, NULL, NULL, &timeout) == 1 ? 1 : 0; +} + +static int bgzip_main_usage() +{ + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: bgzip [options] [file] ...\n\n"); + fprintf(stderr, "Options: -c write on standard output, keep original files unchanged\n"); + fprintf(stderr, " -d decompress\n"); + fprintf(stderr, " -b INT decompress at virtual file pointer INT\n"); + fprintf(stderr, " -s INT decompress INT bytes in the uncompressed file\n"); + fprintf(stderr, " -h give this help\n"); + fprintf(stderr, "\n"); + return 1; +} + +static int write_open(const char *fn, int is_forced) +{ + int fd = -1; + char c; + if (!is_forced) { + if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL, 0666)) < 0 && errno == EEXIST) { + fprintf(stderr, "[bgzip] %s already exists; do you wish to overwrite (y or n)? ", fn); + scanf("%c", &c); + if (c != 'Y' && c != 'y') { + fprintf(stderr, "[bgzip] not overwritten\n"); + exit(1); + } + } + } + if (fd < 0) { + if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC, 0666)) < 0) { + fprintf(stderr, "[bgzip] %s: Fail to write\n", fn); + exit(1); + } + } + return fd; +} + +static void fail(BGZF* fp) +{ + fprintf(stderr, "Error: %s\n", fp->error); + exit(1); +} + +int main(int argc, char **argv) +{ + int c, compress, pstdout, is_forced; + BGZF *fp; + void *buffer; + long start, end, size; + + compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0; + while((c = getopt(argc, argv, "cdhfb:s:")) >= 0){ + switch(c){ + case 'h': return bgzip_main_usage(); + case 'd': compress = 0; break; + case 'c': pstdout = 1; break; + case 'b': start = atol(optarg); break; + case 's': size = atol(optarg); break; + case 'f': is_forced = 1; break; + } + } + if (size >= 0) end = start + size; + if (end >= 0 && end < start) { + fprintf(stderr, "[bgzip] Illegal region: [%ld, %ld]\n", start, end); + return 1; + } + if (compress == 1) { + int f_src, f_dst = -1; + if (is_ready(fileno(stdin))) pstdout = 1; + if (argc > optind && !pstdout) { + if ((f_src = open(argv[optind], O_RDONLY)) < 0) { + fprintf(stderr, "[bgzip] Cannot open file: %s\n", argv[optind]); + return 1; + } + if (pstdout) { + f_dst = fileno(stdout); + } else { + char *name = malloc(sizeof(strlen(argv[optind]) + 5)); + strcpy(name, argv[optind]); + strcat(name, ".gz"); + f_dst = write_open(name, is_forced); + if (f_dst < 0) return 1; + free(name); + } + } else if (pstdout) { + f_src = fileno(stdin); + f_dst = fileno(stdout); + } else return bgzip_main_usage(); + fp = bgzf_fdopen(f_dst, "w"); + buffer = malloc(WINDOW_SIZE); + while ((c = read(f_src, buffer, WINDOW_SIZE)) > 0) + if (bgzf_write(fp, buffer, c) < 0) fail(fp); + // f_dst will be closed here + if (bgzf_close(fp) < 0) fail(fp); + if (argc > optind) unlink(argv[optind]); + free(buffer); + close(f_src); + return 0; + } else { + int f_dst, is_stdin = 0; + if (argc == optind) pstdout = 1; + if (is_ready(fileno(stdin))) is_stdin = 1; + if (argc <= optind && !is_stdin) return bgzip_main_usage(); + if (argc > optind && !pstdout) { + char *name; + if (strstr(argv[optind], ".gz") - argv[optind] != strlen(argv[optind]) - 3) { + fprintf(stderr, "[bgzip] %s: unknown suffix -- ignored\n", argv[optind]); + return 1; + } + name = strdup(argv[optind]); + name[strlen(name) - 3] = '\0'; + f_dst = write_open(name, is_forced); + free(name); + } else f_dst = fileno(stdout); + fp = (argc == optind)? bgzf_fdopen(fileno(stdin), "r") : bgzf_open(argv[optind], "r"); + if (fp == NULL) { + fprintf(stderr, "[bgzip] Could not open file: %s\n", argv[optind]); + return 1; + } + buffer = malloc(WINDOW_SIZE); + if (bgzf_seek(fp, start, SEEK_SET) < 0) fail(fp); + while (1) { + if (end < 0) c = bgzf_read(fp, buffer, WINDOW_SIZE); + else c = bgzf_read(fp, buffer, (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start)); + if (c == 0) break; + if (c < 0) fail(fp); + start += c; + write(f_dst, buffer, c); + if (end >= 0 && start >= end) break; + } + free(buffer); + if (bgzf_close(fp) < 0) fail(fp); + if (!pstdout) unlink(argv[optind]); + return 0; + } +} diff --git a/index.c b/index.c new file mode 100644 index 0000000..cc56498 --- /dev/null +++ b/index.c @@ -0,0 +1,789 @@ +#include +#include +#include "khash.h" +#include "ksort.h" +#include "kstring.h" +#include "bam_endian.h" +#ifdef _USE_KNETFILE +#include "knetfile.h" +#endif +#include "tabix.h" + +#define TAD_MIN_CHUNK_GAP 32768 +// 1<<14 is the size of minimum bin. +#define TAD_LIDX_SHIFT 14 + +typedef struct { + uint64_t u, v; +} pair64_t; + +#define pair64_lt(a,b) ((a).u < (b).u) +KSORT_INIT(off, pair64_t, pair64_lt) + +typedef struct { + uint32_t m, n; + pair64_t *list; +} ti_binlist_t; + +typedef struct { + int32_t n, m; + uint64_t *offset; +} ti_lidx_t; + +KHASH_MAP_INIT_INT(i, ti_binlist_t) +KHASH_MAP_INIT_STR(s, int) + +struct __ti_index_t { + ti_conf_t conf; + int32_t n, max; + khash_t(s) *tname; + khash_t(i) **index; + ti_lidx_t *index2; +}; + +typedef struct { + int tid, beg, end, bin; +} ti_intv_t; + +ti_conf_t ti_conf_gff = { 0, 1, 4, 5, '#', 0 }; +ti_conf_t ti_conf_bed = { TI_FLAG_UCSC, 1, 2, 3, '#', 0 }; +ti_conf_t ti_conf_psltbl = { TI_FLAG_UCSC, 15, 17, 18, '#', 0 }; +ti_conf_t ti_conf_sam = { TI_PRESET_SAM, 3, 4, 0, '@', 0 }; +ti_conf_t ti_conf_vcf = { TI_PRESET_VCF, 1, 2, 0, '#', 0 }; + +/*************** + * read a line * + ***************/ + +/* +int ti_readline(BGZF *fp, kstring_t *str) +{ + int c, l = 0; + str->l = 0; + while ((c = bgzf_getc(fp)) >= 0 && c != '\n') { + ++l; + if (c != '\r') kputc(c, str); + } + if (c < 0 && l == 0) return -1; // end of file + return str->l; +} +*/ + +/* Below is a faster implementation largely equivalent to the one + * commented out above. */ +int ti_readline(BGZF *fp, kstring_t *str) +{ + int l, state = 0; + unsigned char *buf = (unsigned char*)fp->uncompressed_block; + str->l = 0; + do { + if (fp->block_offset >= fp->block_length) { + if (bgzf_read_block(fp) != 0) { state = -2; break; } + if (fp->block_length == 0) { state = -1; break; } + } + for (l = fp->block_offset; l < fp->block_length && buf[l] != '\n'; ++l); + if (l < fp->block_length) state = 1; + l -= fp->block_offset; + if (str->l + l + 1 >= str->m) { + str->m = str->l + l + 2; + kroundup32(str->m); + str->s = (char*)realloc(str->s, str->m); + } + memcpy(str->s + str->l, buf + fp->block_offset, l); + str->l += l; + fp->block_offset += l + 1; + if (fp->block_offset >= fp->block_length) { +#ifdef _USE_KNETFILE + fp->block_address = knet_tell(fp->x.fpr); +#else + fp->block_address = ftello(fp->file); +#endif + fp->block_offset = 0; + fp->block_length = 0; + } + } while (state == 0); + if (str->l == 0 && state < 0) return state; + str->s[str->l] = 0; + return str->l; +} + +/************************************* + * get the interval from a data line * + *************************************/ + +static inline int ti_reg2bin(uint32_t beg, uint32_t end) +{ + --end; + if (beg>>14 == end>>14) return 4681 + (beg>>14); + if (beg>>17 == end>>17) return 585 + (beg>>17); + if (beg>>20 == end>>20) return 73 + (beg>>20); + if (beg>>23 == end>>23) return 9 + (beg>>23); + if (beg>>26 == end>>26) return 1 + (beg>>26); + return 0; +} + +static int get_tid(ti_index_t *idx, const char *ss) +{ + khint_t k; + int tid; + k = kh_get(s, idx->tname, ss); + if (k == kh_end(idx->tname)) { // a new target sequence + int ret, size; + // update idx->n, ->max, ->index and ->index2 + if (idx->n == idx->max) { + idx->max = idx->max? idx->max<<1 : 8; + idx->index = realloc(idx->index, idx->max * sizeof(void*)); + idx->index2 = realloc(idx->index2, idx->max * sizeof(ti_lidx_t)); + } + memset(&idx->index2[idx->n], 0, sizeof(ti_lidx_t)); + idx->index[idx->n++] = kh_init(i); + // update ->tname + tid = size = kh_size(idx->tname); + k = kh_put(s, idx->tname, strdup(ss), &ret); + kh_value(idx->tname, k) = size; + assert(idx->n == kh_size(idx->tname)); + } else tid = kh_value(idx->tname, k); + return tid; +} + +static int get_intv(ti_index_t *idx, kstring_t *str, ti_intv_t *intv) +{ + int i, b = 0, id = 1; + char *s; + intv->tid = intv->beg = intv->end = intv->bin = -1; + for (i = 0; i <= str->l; ++i) { + if (str->s[i] == '\t' || str->s[i] == 0) { + if (id == idx->conf.sc) { + str->s[i] = 0; + intv->tid = get_tid(idx, str->s + b); + if (i != str->l) str->s[i] = '\t'; + } else if (id == idx->conf.bc) { + // here ->beg is 1-based. it will be changed to 0-based at the end of this routine. + intv->beg = strtol(str->s + b, &s, 0); + if (idx->conf.preset&TI_FLAG_UCSC) ++intv->beg; + } else { + if ((idx->conf.preset&0xffff) == TI_PRESET_GENERIC) { + if (id == idx->conf.ec) intv->end = strtol(str->s + b, &s, 0); + } else if ((idx->conf.preset&0xffff) == TI_PRESET_SAM) { + if (id == 6) { // CIGAR + int l = 0, op; + char *t; + for (s = str->s + b; s < str->s + i;) { + long x = strtol(s, &t, 10); + op = toupper(*t); + if (op == 'M' || op == 'D' || op == 'N') l += x; + s = t + 1; + } + intv->end = intv->beg + l - 1; + } + } else if ((idx->conf.preset&0xffff) == TI_PRESET_VCF) { + // FIXME: the following is NOT tested and is likely to be buggy + if (id == 5) { // ALT + char *t; + int max = 1; + for (s = str->s + b; s < str->s + i;) { + if (s[i] == 'D') { + long x = strtol(s + 1, &t, 10); + if (x > max) max = x; + s = t + 1; + } else ++s; + } + intv->end = intv->beg + max - 1; + } + } + } + b = i + 1; + ++id; + } + } + if (intv->tid < 0 || intv->beg < 0 || intv->end < 0) return -1; + intv->bin = ti_reg2bin(intv->beg-1, intv->end); + return 0; +} + +/************ + * indexing * + ************/ + +// requirement: len <= LEN_MASK +static inline void insert_offset(khash_t(i) *h, int bin, uint64_t beg, uint64_t end) +{ + khint_t k; + ti_binlist_t *l; + int ret; + k = kh_put(i, h, bin, &ret); + l = &kh_value(h, k); + if (ret) { // not present + l->m = 1; l->n = 0; + l->list = (pair64_t*)calloc(l->m, 16); + } + if (l->n == l->m) { + l->m <<= 1; + l->list = (pair64_t*)realloc(l->list, l->m * 16); + } + l->list[l->n].u = beg; l->list[l->n++].v = end; +} + +static inline void insert_offset2(ti_lidx_t *index2, int _beg, int _end, uint64_t offset) +{ + int i, beg, end; + beg = _beg >> TAD_LIDX_SHIFT; + end = (_end - 1) >> TAD_LIDX_SHIFT; + if (index2->m < end + 1) { + int old_m = index2->m; + index2->m = end + 1; + kroundup32(index2->m); + index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8); + memset(index2->offset + old_m, 0, 8 * (index2->m - old_m)); + } + for (i = beg + 1; i <= end; ++i) + if (index2->offset[i] == 0) index2->offset[i] = offset; + index2->n = end + 1; +} + +static void merge_chunks(ti_index_t *idx) +{ + khash_t(i) *index; + int i, l, m; + khint_t k; + for (i = 0; i < idx->n; ++i) { + index = idx->index[i]; + for (k = kh_begin(index); k != kh_end(index); ++k) { + ti_binlist_t *p; + if (!kh_exist(index, k)) continue; + p = &kh_value(index, k); + m = 0; + for (l = 1; l < p->n; ++l) { + if (p->list[m].v>>16 == p->list[l].u>>16) p->list[m].v = p->list[l].v; + else p->list[++m] = p->list[l]; + } // ~for(l) + p->n = m + 1; + } // ~for(k) + } // ~for(i) +} + +ti_index_t *ti_index_core(BGZF *fp, const ti_conf_t *conf) +{ + int ret; + ti_index_t *idx; + uint32_t last_bin, save_bin; + int32_t last_coor, last_tid, save_tid; + uint64_t save_off, last_off, lineno = 0; + kstring_t *str; + + str = calloc(1, sizeof(kstring_t)); + + idx = (ti_index_t*)calloc(1, sizeof(ti_index_t)); + idx->conf = *conf; + idx->n = idx->max = 0; + idx->tname = kh_init(s); + idx->index = 0; + idx->index2 = 0; + + save_bin = save_tid = last_tid = last_bin = 0xffffffffu; + save_off = last_off = bgzf_tell(fp); last_coor = 0xffffffffu; + while ((ret = ti_readline(fp, str)) >= 0) { + ti_intv_t intv; + ++lineno; + if (lineno <= idx->conf.line_skip || str->s[0] == idx->conf.meta_char) { + last_off = bgzf_tell(fp); + continue; + } + get_intv(idx, str, &intv); + if (last_tid != intv.tid) { // change of chromosomes + last_tid = intv.tid; + last_bin = 0xffffffffu; + } else if (last_coor > intv.beg) { + fprintf(stderr, "[ti_index_core] the file is not sorted at line %llu\n", (unsigned long long)lineno); + exit(1); + } + if (intv.bin < 4681) insert_offset2(&idx->index2[intv.tid], intv.beg, intv.end, last_off); + if (intv.bin != last_bin) { // then possibly write the binning index + if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record + insert_offset(idx->index[save_tid], save_bin, save_off, last_off); + save_off = last_off; + save_bin = last_bin = intv.bin; + save_tid = intv.tid; + if (save_tid < 0) break; + } + if (bgzf_tell(fp) <= last_off) { + fprintf(stderr, "[ti_index_core] bug in BGZF: %llx < %llx\n", + (unsigned long long)bgzf_tell(fp), (unsigned long long)last_off); + exit(1); + } + last_off = bgzf_tell(fp); + last_coor = intv.beg; + } + if (save_tid >= 0) insert_offset(idx->index[save_tid], save_bin, save_off, bgzf_tell(fp)); + merge_chunks(idx); + + free(str->s); free(str); + return idx; +} + +void ti_index_destroy(ti_index_t *idx) +{ + khint_t k; + int i; + if (idx == 0) return; + // destroy the name hash table + for (k = kh_begin(idx->tname); k != kh_end(idx->tname); ++k) { + if (kh_exist(idx->tname, k)) + free((char*)kh_key(idx->tname, k)); + } + kh_destroy(s, idx->tname); + // destroy the binning index + for (i = 0; i < idx->n; ++i) { + khash_t(i) *index = idx->index[i]; + ti_lidx_t *index2 = idx->index2 + i; + for (k = kh_begin(index); k != kh_end(index); ++k) { + if (kh_exist(index, k)) + free(kh_value(index, k).list); + } + kh_destroy(i, index); + free(index2->offset); + } + free(idx->index); + // destroy the linear index + free(idx->index2); + free(idx); +} + +/****************** + * index file I/O * + ******************/ + +void ti_index_save(const ti_index_t *idx, BGZF *fp) +{ + int32_t i, size, ti_is_be; + khint_t k; + ti_is_be = bam_is_big_endian(); + bgzf_write(fp, "TBI\1", 4); + if (ti_is_be) { + uint32_t x = idx->n; + bgzf_write(fp, bam_swap_endian_4p(&x), 4); + } else bgzf_write(fp, &idx->n, 4); + assert(sizeof(ti_conf_t) == 24); + if (ti_is_be) { // write ti_conf_t; + uint32_t x[6]; + memcpy(x, &idx->conf, 24); + for (i = 0; i < 6; ++i) bgzf_write(fp, bam_swap_endian_4p(&x[i]), 4); + } else bgzf_write(fp, &idx->conf, sizeof(ti_conf_t)); + { // write target names + char **name; + int32_t l = 0; + name = calloc(kh_size(idx->tname), sizeof(void*)); + for (k = kh_begin(idx->tname); k != kh_end(idx->tname); ++k) + if (kh_exist(idx->tname, k)) + name[kh_value(idx->tname, k)] = (char*)kh_key(idx->tname, k); + for (i = 0; i < kh_size(idx->tname); ++i) + l += strlen(name[i]) + 1; + if (ti_is_be) bgzf_write(fp, bam_swap_endian_4p(&l), 4); + else bgzf_write(fp, &l, 4); + for (i = 0; i < kh_size(idx->tname); ++i) + bgzf_write(fp, name[i], strlen(name[i]) + 1); + free(name); + } + for (i = 0; i < idx->n; ++i) { + khash_t(i) *index = idx->index[i]; + ti_lidx_t *index2 = idx->index2 + i; + // write binning index + size = kh_size(index); + if (ti_is_be) { // big endian + uint32_t x = size; + bgzf_write(fp, bam_swap_endian_4p(&x), 4); + } else bgzf_write(fp, &size, 4); + for (k = kh_begin(index); k != kh_end(index); ++k) { + if (kh_exist(index, k)) { + ti_binlist_t *p = &kh_value(index, k); + if (ti_is_be) { // big endian + uint32_t x; + x = kh_key(index, k); bgzf_write(fp, bam_swap_endian_4p(&x), 4); + x = p->n; bgzf_write(fp, bam_swap_endian_4p(&x), 4); + for (x = 0; (int)x < p->n; ++x) { + bam_swap_endian_8p(&p->list[x].u); + bam_swap_endian_8p(&p->list[x].v); + } + bgzf_write(fp, p->list, 16 * p->n); + for (x = 0; (int)x < p->n; ++x) { + bam_swap_endian_8p(&p->list[x].u); + bam_swap_endian_8p(&p->list[x].v); + } + } else { + bgzf_write(fp, &kh_key(index, k), 4); + bgzf_write(fp, &p->n, 4); + bgzf_write(fp, p->list, 16 * p->n); + } + } + } + // write linear index (index2) + if (ti_is_be) { + int x = index2->n; + bgzf_write(fp, bam_swap_endian_4p(&x), 4); + } else bgzf_write(fp, &index2->n, 4); + if (ti_is_be) { // big endian + int x; + for (x = 0; (int)x < index2->n; ++x) + bam_swap_endian_8p(&index2->offset[x]); + bgzf_write(fp, index2->offset, 8 * index2->n); + for (x = 0; (int)x < index2->n; ++x) + bam_swap_endian_8p(&index2->offset[x]); + } else bgzf_write(fp, index2->offset, 8 * index2->n); + } +} + +static ti_index_t *ti_index_load_core(BGZF *fp) +{ + int i, ti_is_be; + char magic[4]; + ti_index_t *idx; + ti_is_be = bam_is_big_endian(); + if (fp == 0) { + fprintf(stderr, "[ti_index_load_core] fail to load index.\n"); + return 0; + } + bgzf_read(fp, magic, 4); + if (strncmp(magic, "TBI\1", 4)) { + fprintf(stderr, "[ti_index_load] wrong magic number.\n"); + return 0; + } + idx = (ti_index_t*)calloc(1, sizeof(ti_index_t)); + bgzf_read(fp, &idx->n, 4); + if (ti_is_be) bam_swap_endian_4p(&idx->n); + idx->tname = kh_init(s); + idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*)); + idx->index2 = (ti_lidx_t*)calloc(idx->n, sizeof(ti_lidx_t)); + // read idx->conf + bgzf_read(fp, &idx->conf, sizeof(ti_conf_t)); + if (ti_is_be) { + bam_swap_endian_4p(&idx->conf.preset); + bam_swap_endian_4p(&idx->conf.sc); + bam_swap_endian_4p(&idx->conf.bc); + bam_swap_endian_4p(&idx->conf.ec); + bam_swap_endian_4p(&idx->conf.meta_char); + bam_swap_endian_4p(&idx->conf.line_skip); + } + { // read target names + int j, ret; + kstring_t *str; + int32_t l; + uint8_t *buf; + bgzf_read(fp, &l, 4); + if (ti_is_be) bam_swap_endian_4p(&l); + buf = calloc(l, 1); + bgzf_read(fp, buf, l); + str = calloc(1, sizeof(kstring_t)); + for (i = j = 0; i < l; ++i) { + if (buf[i] == 0) { + khint_t k = kh_put(s, idx->tname, strdup(str->s), &ret); + kh_value(idx->tname, k) = j++; + str->l = 0; + } else kputc(buf[i], str); + } + free(str->s); free(str); free(buf); + } + for (i = 0; i < idx->n; ++i) { + khash_t(i) *index; + ti_lidx_t *index2 = idx->index2 + i; + uint32_t key, size; + khint_t k; + int j, ret; + ti_binlist_t *p; + index = idx->index[i] = kh_init(i); + // load binning index + bgzf_read(fp, &size, 4); + if (ti_is_be) bam_swap_endian_4p(&size); + for (j = 0; j < (int)size; ++j) { + bgzf_read(fp, &key, 4); + if (ti_is_be) bam_swap_endian_4p(&key); + k = kh_put(i, index, key, &ret); + p = &kh_value(index, k); + bgzf_read(fp, &p->n, 4); + if (ti_is_be) bam_swap_endian_4p(&p->n); + p->m = p->n; + p->list = (pair64_t*)malloc(p->m * 16); + bgzf_read(fp, p->list, 16 * p->n); + if (ti_is_be) { + int x; + for (x = 0; x < p->n; ++x) { + bam_swap_endian_8p(&p->list[x].u); + bam_swap_endian_8p(&p->list[x].v); + } + } + } + // load linear index + bgzf_read(fp, &index2->n, 4); + if (ti_is_be) bam_swap_endian_4p(&index2->n); + index2->m = index2->n; + index2->offset = (uint64_t*)calloc(index2->m, 8); + bgzf_read(fp, index2->offset, index2->n * 8); + if (ti_is_be) + for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]); + } + return idx; +} + +ti_index_t *ti_index_load_local(const char *_fn) +{ + BGZF *fp; + char *fnidx, *fn; + + if (strstr(_fn, "ftp://") == _fn || strstr(_fn, "http://") == _fn) { + const char *p; + int l = strlen(_fn); + for (p = _fn + l - 1; p >= _fn; --p) + if (*p == '/') break; + fn = strdup(p + 1); + } else fn = strdup(_fn); + fnidx = (char*)calloc(strlen(fn) + 5, 1); + strcpy(fnidx, fn); strcat(fnidx, ".tbi"); + fp = bgzf_open(fnidx, "r"); + free(fnidx); free(fn); + if (fp) { + ti_index_t *idx = ti_index_load_core(fp); + bgzf_close(fp); + return idx; + } else return 0; +} + +#ifdef _USE_KNETFILE +static void download_from_remote(const char *url) +{ + const int buf_size = 1 * 1024 * 1024; + char *fn; + FILE *fp; + uint8_t *buf; + knetFile *fp_remote; + int l; + if (strstr(url, "ftp://") != url && strstr(url, "http://") != url) return; + l = strlen(url); + for (fn = (char*)url + l - 1; fn >= url; --fn) + if (*fn == '/') break; + ++fn; // fn now points to the file name + fp_remote = knet_open(url, "r"); + if (fp_remote == 0) { + fprintf(stderr, "[download_from_remote] fail to open remote file.\n"); + return; + } + if ((fp = fopen(fn, "w")) == 0) { + fprintf(stderr, "[download_from_remote] fail to create file in the working directory.\n"); + knet_close(fp_remote); + return; + } + buf = (uint8_t*)calloc(buf_size, 1); + while ((l = knet_read(fp_remote, buf, buf_size)) != 0) + fwrite(buf, 1, l, fp); + free(buf); + fclose(fp); + knet_close(fp_remote); +} +#else +static void download_from_remote(const char *url) +{ + return; +} +#endif + +ti_index_t *ti_index_load(const char *fn) +{ + ti_index_t *idx; + idx = ti_index_load_local(fn); + if (idx == 0 && (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn)) { + char *fnidx = calloc(strlen(fn) + 5, 1); + strcat(strcpy(fnidx, fn), ".tbi"); + fprintf(stderr, "[ti_index_load] attempting to download the remote index file.\n"); + download_from_remote(fnidx); + idx = ti_index_load_local(fn); + } + if (idx == 0) fprintf(stderr, "[ti_index_load] fail to load BAM index.\n"); + return idx; +} + +int ti_index_build2(const char *fn, const ti_conf_t *conf, const char *_fnidx) +{ + char *fnidx; + BGZF *fp, *fpidx; + ti_index_t *idx; + if ((fp = bgzf_open(fn, "r")) == 0) { + fprintf(stderr, "[ti_index_build2] fail to open the BAM file.\n"); + return -1; + } + idx = ti_index_core(fp, conf); + bgzf_close(fp); + if (_fnidx == 0) { + fnidx = (char*)calloc(strlen(fn) + 5, 1); + strcpy(fnidx, fn); strcat(fnidx, ".tbi"); + } else fnidx = strdup(_fnidx); + fpidx = bgzf_open(fnidx, "w"); + if (fpidx == 0) { + fprintf(stderr, "[ti_index_build2] fail to create the index file.\n"); + free(fnidx); + return -1; + } + ti_index_save(idx, fpidx); + ti_index_destroy(idx); + bgzf_close(fpidx); + free(fnidx); + return 0; +} + +int ti_index_build(const char *fn, const ti_conf_t *conf) +{ + return ti_index_build2(fn, conf, 0); +} + +/******************************************** + * parse a region in the format chr:beg-end * + ********************************************/ + +int ti_parse_region(ti_index_t *idx, const char *str, int *tid, int *begin, int *end) +{ + char *s, *p; + int i, l, k; + khiter_t iter; + khash_t(s) *h = idx->tname; + l = strlen(str); + p = s = (char*)malloc(l+1); + /* squeeze out "," */ + for (i = k = 0; i != l; ++i) + if (str[i] != ',' && !isspace(str[i])) s[k++] = str[i]; + s[k] = 0; + for (i = 0; i != k; ++i) if (s[i] == ':') break; + s[i] = 0; + iter = kh_get(s, h, s); /* get the tid */ + if (iter == kh_end(h)) { // name not found + *tid = -1; free(s); + return -1; + } + *tid = kh_value(h, iter); + if (i == k) { /* dump the whole sequence */ + *begin = 0; *end = 1<<29; free(s); + return 0; + } + for (p = s + i + 1; i != k; ++i) if (s[i] == '-') break; + *begin = atoi(p); + if (i < k) { + p = s + i + 1; + *end = atoi(p); + } else *end = 1<<29; + if (*begin > 0) --*begin; + free(s); + if (*begin > *end) return -1; + return 0; +} + +/******************************* + * retrieve a specified region * + *******************************/ + +#define MAX_BIN 37450 // =(8^6-1)/7+1 + +static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[MAX_BIN]) +{ + int i = 0, k; + --end; + list[i++] = 0; + for (k = 1 + (beg>>26); k <= 1 + (end>>26); ++k) list[i++] = k; + for (k = 9 + (beg>>23); k <= 9 + (end>>23); ++k) list[i++] = k; + for (k = 73 + (beg>>20); k <= 73 + (end>>20); ++k) list[i++] = k; + for (k = 585 + (beg>>17); k <= 585 + (end>>17); ++k) list[i++] = k; + for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k; + return i; +} + +static inline int is_overlap(int32_t beg, int32_t end, int rbeg, int rend) +{ + return (rend > beg && rbeg < end); +} + +// ti_fetch helper function retrieves +pair64_t *get_chunk_coordinates(const ti_index_t *idx, int tid, int beg, int end, int* cnt_off) +{ + uint16_t *bins; + int i, n_bins, n_off; + pair64_t *off; + khint_t k; + khash_t(i) *index; + uint64_t min_off; + + bins = (uint16_t*)calloc(MAX_BIN, 2); + n_bins = reg2bins(beg, end, bins); + index = idx->index[tid]; + min_off = (beg>>TAD_LIDX_SHIFT >= idx->index2[tid].n)? 0 : idx->index2[tid].offset[beg>>TAD_LIDX_SHIFT]; + for (i = n_off = 0; i < n_bins; ++i) { + if ((k = kh_get(i, index, bins[i])) != kh_end(index)) + n_off += kh_value(index, k).n; + } + if (n_off == 0) { + free(bins); return 0; + } + off = (pair64_t*)calloc(n_off, 16); + for (i = n_off = 0; i < n_bins; ++i) { + if ((k = kh_get(i, index, bins[i])) != kh_end(index)) { + int j; + ti_binlist_t *p = &kh_value(index, k); + for (j = 0; j < p->n; ++j) + if (p->list[j].v > min_off) off[n_off++] = p->list[j]; + } + } + free(bins); + { + int l; + ks_introsort(off, n_off, off); + // resolve completely contained adjacent blocks + for (i = 1, l = 0; i < n_off; ++i) + if (off[l].v < off[i].v) + off[++l] = off[i]; + n_off = l + 1; + // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing + for (i = 1; i < n_off; ++i) + if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u; + { // merge adjacent blocks + for (i = 1, l = 0; i < n_off; ++i) { + if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v; + else off[++l] = off[i]; + } + n_off = l + 1; + } + } + *cnt_off = n_off; + return off; +} + +int ti_fetch(BGZF *fp, const ti_index_t *idx, int tid, int beg, int end, void *data, ti_fetch_f func) +{ + int n_off; + pair64_t *off = get_chunk_coordinates(idx, tid, beg, end, &n_off); + if (off == 0) return 0; + { + // retrive alignments + uint64_t curr_off; + int i, ret, n_seeks; + kstring_t *str = calloc(1, sizeof(kstring_t)); + n_seeks = 0; i = -1; curr_off = 0; + for (;;) { + if (curr_off == 0 || curr_off >= off[i].v) { // then jump to the next chunk + if (i == n_off - 1) break; // no more chunks + if (i >= 0) assert(curr_off == off[i].v); // otherwise bug + if (i < 0 || off[i].v != off[i+1].u) { // not adjacent chunks; then seek + bgzf_seek(fp, off[i+1].u, SEEK_SET); + curr_off = bgzf_tell(fp); + ++n_seeks; + } + ++i; + } + if ((ret = ti_readline(fp, str)) >= 0) { + ti_intv_t intv; + curr_off = bgzf_tell(fp); + if (str->s[0] == idx->conf.meta_char) continue; + get_intv((ti_index_t*)idx, str, &intv); + if (intv.tid != tid || intv.beg >= end) break; // no need to proceed + else if (is_overlap(beg, end, intv.beg, intv.end)) func(str->l, str->s, data); + } else break; // end of file + } +// fprintf(stderr, "[ti_fetch] # seek calls: %d\n", n_seeks); + free(str->s); free(str); + } + free(off); + return 0; +} diff --git a/khash.h b/khash.h new file mode 100644 index 0000000..1d583ef --- /dev/null +++ b/khash.h @@ -0,0 +1,486 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li */ + +/* + An example: + +#include "khash.h" +KHASH_MAP_INIT_INT(32, char) +int main() { + int ret, is_missing; + khiter_t k; + khash_t(32) *h = kh_init(32); + k = kh_put(32, h, 5, &ret); + if (!ret) kh_del(32, h, k); + kh_value(h, k) = 10; + k = kh_get(32, h, 10); + is_missing = (k == kh_end(h)); + k = kh_get(32, h, 5); + kh_del(32, h, k); + for (k = kh_begin(h); k != kh_end(h); ++k) + if (kh_exist(h, k)) kh_value(h, k) = 1; + kh_destroy(32, h); + return 0; +} +*/ + +/* + 2008-09-19 (0.2.3): + + * Corrected the example + * Improved interfaces + + 2008-09-11 (0.2.2): + + * Improved speed a little in kh_put() + + 2008-09-10 (0.2.1): + + * Added kh_clear() + * Fixed a compiling error + + 2008-09-02 (0.2.0): + + * Changed to token concatenation which increases flexibility. + + 2008-08-31 (0.1.2): + + * Fixed a bug in kh_get(), which has not been tested previously. + + 2008-08-31 (0.1.1): + + * Added destructor +*/ + + +#ifndef __AC_KHASH_H +#define __AC_KHASH_H + +/*! + @header + + Generic hash table library. + + @copyright Heng Li + */ + +#define AC_VERSION_KHASH_H "0.2.2" + +#include +#include +#include + +typedef uint32_t khint_t; +typedef khint_t khiter_t; + +#define __ac_HASH_PRIME_SIZE 32 +static const uint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] = +{ + 0ul, 3ul, 11ul, 23ul, 53ul, + 97ul, 193ul, 389ul, 769ul, 1543ul, + 3079ul, 6151ul, 12289ul, 24593ul, 49157ul, + 98317ul, 196613ul, 393241ul, 786433ul, 1572869ul, + 3145739ul, 6291469ul, 12582917ul, 25165843ul, 50331653ul, + 100663319ul, 201326611ul, 402653189ul, 805306457ul, 1610612741ul, + 3221225473ul, 4294967291ul +}; + +#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2) +#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1) +#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3) +#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1))) +#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1))) +#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1))) +#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1)) + +static const double __ac_HASH_UPPER = 0.77; + +#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + typedef struct { \ + khint_t n_buckets, size, n_occupied, upper_bound; \ + uint32_t *flags; \ + khkey_t *keys; \ + khval_t *vals; \ + } kh_##name##_t; \ + static inline kh_##name##_t *kh_init_##name() { \ + return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t)); \ + } \ + static inline void kh_destroy_##name(kh_##name##_t *h) \ + { \ + if (h) { \ + free(h->keys); free(h->flags); \ + free(h->vals); \ + free(h); \ + } \ + } \ + static inline void kh_clear_##name(kh_##name##_t *h) \ + { \ + if (h && h->flags) { \ + memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(uint32_t)); \ + h->size = h->n_occupied = 0; \ + } \ + } \ + static inline khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ + { \ + if (h->n_buckets) { \ + khint_t inc, k, i, last; \ + k = __hash_func(key); i = k % h->n_buckets; \ + inc = 1 + k % (h->n_buckets - 1); last = i; \ + while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ + if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \ + else i += inc; \ + if (i == last) return h->n_buckets; \ + } \ + return __ac_iseither(h->flags, i)? h->n_buckets : i; \ + } else return 0; \ + } \ + static inline void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ + { \ + uint32_t *new_flags = 0; \ + khint_t j = 1; \ + { \ + khint_t t = __ac_HASH_PRIME_SIZE - 1; \ + while (__ac_prime_list[t] > new_n_buckets) --t; \ + new_n_buckets = __ac_prime_list[t+1]; \ + if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; \ + else { \ + new_flags = (uint32_t*)malloc(((new_n_buckets>>4) + 1) * sizeof(uint32_t)); \ + memset(new_flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(uint32_t)); \ + if (h->n_buckets < new_n_buckets) { \ + h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (kh_is_map) \ + h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ + } \ + } \ + } \ + if (j) { \ + for (j = 0; j != h->n_buckets; ++j) { \ + if (__ac_iseither(h->flags, j) == 0) { \ + khkey_t key = h->keys[j]; \ + khval_t val; \ + if (kh_is_map) val = h->vals[j]; \ + __ac_set_isdel_true(h->flags, j); \ + while (1) { \ + khint_t inc, k, i; \ + k = __hash_func(key); \ + i = k % new_n_buckets; \ + inc = 1 + k % (new_n_buckets - 1); \ + while (!__ac_isempty(new_flags, i)) { \ + if (i + inc >= new_n_buckets) i = i + inc - new_n_buckets; \ + else i += inc; \ + } \ + __ac_set_isempty_false(new_flags, i); \ + if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { \ + { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ + if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \ + __ac_set_isdel_true(h->flags, i); \ + } else { \ + h->keys[i] = key; \ + if (kh_is_map) h->vals[i] = val; \ + break; \ + } \ + } \ + } \ + } \ + if (h->n_buckets > new_n_buckets) { \ + h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (kh_is_map) \ + h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ + } \ + free(h->flags); \ + h->flags = new_flags; \ + h->n_buckets = new_n_buckets; \ + h->n_occupied = h->size; \ + h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ + } \ + } \ + static inline khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ + { \ + khint_t x; \ + if (h->n_occupied >= h->upper_bound) { \ + if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); \ + else kh_resize_##name(h, h->n_buckets + 1); \ + } \ + { \ + khint_t inc, k, i, site, last; \ + x = site = h->n_buckets; k = __hash_func(key); i = k % h->n_buckets; \ + if (__ac_isempty(h->flags, i)) x = i; \ + else { \ + inc = 1 + k % (h->n_buckets - 1); last = i; \ + while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ + if (__ac_isdel(h->flags, i)) site = i; \ + if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \ + else i += inc; \ + if (i == last) { x = site; break; } \ + } \ + if (x == h->n_buckets) { \ + if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \ + else x = i; \ + } \ + } \ + } \ + if (__ac_isempty(h->flags, x)) { \ + h->keys[x] = key; \ + __ac_set_isboth_false(h->flags, x); \ + ++h->size; ++h->n_occupied; \ + *ret = 1; \ + } else if (__ac_isdel(h->flags, x)) { \ + h->keys[x] = key; \ + __ac_set_isboth_false(h->flags, x); \ + ++h->size; \ + *ret = 2; \ + } else *ret = 0; \ + return x; \ + } \ + static inline void kh_del_##name(kh_##name##_t *h, khint_t x) \ + { \ + if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ + __ac_set_isdel_true(h->flags, x); \ + --h->size; \ + } \ + } + +/* --- BEGIN OF HASH FUNCTIONS --- */ + +/*! @function + @abstract Integer hash function + @param key The integer [uint32_t] + @return The hash value [khint_t] + */ +#define kh_int_hash_func(key) (uint32_t)(key) +/*! @function + @abstract Integer comparison function + */ +#define kh_int_hash_equal(a, b) ((a) == (b)) +/*! @function + @abstract 64-bit integer hash function + @param key The integer [uint64_t] + @return The hash value [khint_t] + */ +#define kh_int64_hash_func(key) (uint32_t)((key)>>33^(key)^(key)<<11) +/*! @function + @abstract 64-bit integer comparison function + */ +#define kh_int64_hash_equal(a, b) ((a) == (b)) +/*! @function + @abstract const char* hash function + @param s Pointer to a null terminated string + @return The hash value + */ +static inline khint_t __ac_X31_hash_string(const char *s) +{ + khint_t h = *s; + if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s; + return h; +} +/*! @function + @abstract Another interface to const char* hash function + @param key Pointer to a null terminated string [const char*] + @return The hash value [khint_t] + */ +#define kh_str_hash_func(key) __ac_X31_hash_string(key) +/*! @function + @abstract Const char* comparison function + */ +#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0) + +/* --- END OF HASH FUNCTIONS --- */ + +/* Other necessary macros... */ + +/*! + @abstract Type of the hash table. + @param name Name of the hash table [symbol] + */ +#define khash_t(name) kh_##name##_t + +/*! @function + @abstract Initiate a hash table. + @param name Name of the hash table [symbol] + @return Pointer to the hash table [khash_t(name)*] + */ +#define kh_init(name) kh_init_##name() + +/*! @function + @abstract Destroy a hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + */ +#define kh_destroy(name, h) kh_destroy_##name(h) + +/*! @function + @abstract Reset a hash table without deallocating memory. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + */ +#define kh_clear(name, h) kh_clear_##name(h) + +/*! @function + @abstract Resize a hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param s New size [khint_t] + */ +#define kh_resize(name, h, s) kh_resize_##name(h, s) + +/*! @function + @abstract Insert a key to the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Key [type of keys] + @param r Extra return code: 0 if the key is present in the hash table; + 1 if the bucket is empty (never used); 2 if the element in + the bucket has been deleted [int*] + @return Iterator to the inserted element [khint_t] + */ +#define kh_put(name, h, k, r) kh_put_##name(h, k, r) + +/*! @function + @abstract Retrieve a key from the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Key [type of keys] + @return Iterator to the found element, or kh_end(h) is the element is absent [khint_t] + */ +#define kh_get(name, h, k) kh_get_##name(h, k) + +/*! @function + @abstract Remove a key from the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Iterator to the element to be deleted [khint_t] + */ +#define kh_del(name, h, k) kh_del_##name(h, k) + + +/*! @function + @abstract Test whether a bucket contains data. + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return 1 if containing data; 0 otherwise [int] + */ +#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x))) + +/*! @function + @abstract Get key given an iterator + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return Key [type of keys] + */ +#define kh_key(h, x) ((h)->keys[x]) + +/*! @function + @abstract Get value given an iterator + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return Value [type of values] + @discussion For hash sets, calling this results in segfault. + */ +#define kh_val(h, x) ((h)->vals[x]) + +/*! @function + @abstract Alias of kh_val() + */ +#define kh_value(h, x) ((h)->vals[x]) + +/*! @function + @abstract Get the start iterator + @param h Pointer to the hash table [khash_t(name)*] + @return The start iterator [khint_t] + */ +#define kh_begin(h) (khint_t)(0) + +/*! @function + @abstract Get the end iterator + @param h Pointer to the hash table [khash_t(name)*] + @return The end iterator [khint_t] + */ +#define kh_end(h) ((h)->n_buckets) + +/*! @function + @abstract Get the number of elements in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @return Number of elements in the hash table [khint_t] + */ +#define kh_size(h) ((h)->size) + +/*! @function + @abstract Get the number of buckets in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @return Number of buckets in the hash table [khint_t] + */ +#define kh_n_buckets(h) ((h)->n_buckets) + +/* More conenient interfaces */ + +/*! @function + @abstract Instantiate a hash set containing integer keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_INT(name) \ + KHASH_INIT(name, uint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_INT(name, khval_t) \ + KHASH_INIT(name, uint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing 64-bit integer keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_INT64(name) \ + KHASH_INIT(name, uint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing 64-bit integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_INT64(name, khval_t) \ + KHASH_INIT(name, uint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) + +typedef const char *kh_cstr_t; +/*! @function + @abstract Instantiate a hash map containing const char* keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_STR(name) \ + KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing const char* keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_STR(name, khval_t) \ + KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal) + +#endif /* __AC_KHASH_H */ diff --git a/knetfile.c b/knetfile.c new file mode 100644 index 0000000..994babb --- /dev/null +++ b/knetfile.c @@ -0,0 +1,632 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li */ + +/* Probably I will not do socket programming in the next few years and + therefore I decide to heavily annotate this file, for Linux and + Windows as well. -lh3 */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _WIN32 +#include +#else +#include +#include +#include +#endif + +#include "knetfile.h" + +/* In winsock.h, the type of a socket is SOCKET, which is: "typedef + * u_int SOCKET". An invalid SOCKET is: "(SOCKET)(~0)", or signed + * integer -1. In knetfile.c, I use "int" for socket type + * throughout. This should be improved to avoid confusion. + * + * In Linux/Mac, recv() and read() do almost the same thing. You can see + * in the header file that netread() is simply an alias of read(). In + * Windows, however, they are different and using recv() is mandatory. + */ + +/* This function tests if the file handler is ready for reading (or + * writing if is_read==0). */ +static int socket_wait(int fd, int is_read) +{ + fd_set fds, *fdr = 0, *fdw = 0; + struct timeval tv; + int ret; + tv.tv_sec = 5; tv.tv_usec = 0; // 5 seconds time out + FD_ZERO(&fds); + FD_SET(fd, &fds); + if (is_read) fdr = &fds; + else fdw = &fds; + ret = select(fd+1, fdr, fdw, 0, &tv); +#ifndef _WIN32 + if (ret == -1) perror("select"); +#else + if (ret == 0) + fprintf(stderr, "select time-out\n"); + else if (ret == SOCKET_ERROR) + fprintf(stderr, "select: %d\n", WSAGetLastError()); +#endif + return ret; +} + +#ifndef _WIN32 +/* This function does not work with Windows due to the lack of + * getaddrinfo() in winsock. It is addapted from an example in "Beej's + * Guide to Network Programming" (http://beej.us/guide/bgnet/). */ +static int socket_connect(const char *host, const char *port) +{ +#define __err_connect(func) do { perror(func); freeaddrinfo(res); return -1; } while (0) + + int on = 1, fd; + struct linger lng = { 0, 0 }; + struct addrinfo hints, *res; + memset(&hints, 0, sizeof(struct addrinfo)); + hints.ai_family = AF_UNSPEC; + hints.ai_socktype = SOCK_STREAM; + /* In Unix/Mac, getaddrinfo() is the most convenient way to get + * server information. */ + if (getaddrinfo(host, port, &hints, &res) != 0) __err_connect("getaddrinfo"); + if ((fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) __err_connect("socket"); + /* The following two setsockopt() are used by ftplib + * (http://nbpfaus.net/~pfau/ftplib/). I am not sure if they + * necessary. */ + if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) __err_connect("setsockopt"); + if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) __err_connect("setsockopt"); + if (connect(fd, res->ai_addr, res->ai_addrlen) != 0) __err_connect("connect"); + freeaddrinfo(res); + return fd; +} +#else +/* MinGW's printf has problem with "%lld" */ +char *int64tostr(char *buf, int64_t x) +{ + int cnt; + int i = 0; + do { + buf[i++] = '0' + x % 10; + x /= 10; + } while (x); + buf[i] = 0; + for (cnt = i, i = 0; i < cnt/2; ++i) { + int c = buf[i]; buf[i] = buf[cnt-i-1]; buf[cnt-i-1] = c; + } + return buf; +} + +int64_t strtoint64(const char *buf) +{ + int64_t x; + for (x = 0; *buf != '\0'; ++buf) + x = x * 10 + ((int64_t) *buf - 48); + return x; +} +/* In windows, the first thing is to establish the TCP connection. */ +int knet_win32_init() +{ + WSADATA wsaData; + return WSAStartup(MAKEWORD(2, 2), &wsaData); +} +void knet_win32_destroy() +{ + WSACleanup(); +} +/* A slightly modfied version of the following function also works on + * Mac (and presummably Linux). However, this function is not stable on + * my Mac. It sometimes works fine but sometimes does not. Therefore for + * non-Windows OS, I do not use this one. */ +static SOCKET socket_connect(const char *host, const char *port) +{ +#define __err_connect(func) \ + do { \ + fprintf(stderr, "%s: %d\n", func, WSAGetLastError()); \ + return -1; \ + } while (0) + + int on = 1; + SOCKET fd; + struct linger lng = { 0, 0 }; + struct sockaddr_in server; + struct hostent *hp = 0; + // open socket + if ((fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) == INVALID_SOCKET) __err_connect("socket"); + if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (char*)&on, sizeof(on)) == -1) __err_connect("setsockopt"); + if (setsockopt(fd, SOL_SOCKET, SO_LINGER, (char*)&lng, sizeof(lng)) == -1) __err_connect("setsockopt"); + // get host info + if (isalpha(host[0])) hp = gethostbyname(host); + else { + struct in_addr addr; + addr.s_addr = inet_addr(host); + hp = gethostbyaddr((char*)&addr, 4, AF_INET); + } + if (hp == 0) __err_connect("gethost"); + // connect + server.sin_addr.s_addr = *((unsigned long*)hp->h_addr); + server.sin_family= AF_INET; + server.sin_port = htons(atoi(port)); + if (connect(fd, (struct sockaddr*)&server, sizeof(server)) != 0) __err_connect("connect"); + // freehostent(hp); // strangely in MSDN, hp is NOT freed (memory leak?!) + return fd; +} +#endif + +static off_t my_netread(int fd, void *buf, off_t len) +{ + off_t rest = len, curr, l = 0; + /* recv() and read() may not read the required length of data with + * one call. They have to be called repeatedly. */ + while (rest) { + if (socket_wait(fd, 1) <= 0) break; // socket is not ready for reading + curr = netread(fd, buf + l, rest); + /* According to the glibc manual, section 13.2, a zero returned + * value indicates end-of-file (EOF), which should mean that + * read() will not return zero if EOF has not been met but data + * are not immediately available. */ + if (curr == 0) break; + l += curr; rest -= curr; + } + return l; +} + +/************************* + * FTP specific routines * + *************************/ + +static int kftp_get_response(knetFile *ftp) +{ +#ifndef _WIN32 + unsigned char c; +#else + char c; +#endif + int n = 0; + char *p; + if (socket_wait(ftp->ctrl_fd, 1) <= 0) return 0; + while (netread(ftp->ctrl_fd, &c, 1)) { // FIXME: this is *VERY BAD* for unbuffered I/O + //fputc(c, stderr); + if (n >= ftp->max_response) { + ftp->max_response = ftp->max_response? ftp->max_response<<1 : 256; + ftp->response = realloc(ftp->response, ftp->max_response); + } + ftp->response[n++] = c; + if (c == '\n') { + if (n >= 4 && isdigit(ftp->response[0]) && isdigit(ftp->response[1]) && isdigit(ftp->response[2]) + && ftp->response[3] != '-') break; + n = 0; + continue; + } + } + if (n < 2) return -1; + ftp->response[n-2] = 0; + return strtol(ftp->response, &p, 0); +} + +static int kftp_send_cmd(knetFile *ftp, const char *cmd, int is_get) +{ + if (socket_wait(ftp->ctrl_fd, 0) <= 0) return -1; // socket is not ready for writing + netwrite(ftp->ctrl_fd, cmd, strlen(cmd)); + return is_get? kftp_get_response(ftp) : 0; +} + +static int kftp_pasv_prep(knetFile *ftp) +{ + char *p; + int v[6]; + kftp_send_cmd(ftp, "PASV\r\n", 1); + for (p = ftp->response; *p && *p != '('; ++p); + if (*p != '(') return -1; + ++p; + sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]); + memcpy(ftp->pasv_ip, v, 4 * sizeof(int)); + ftp->pasv_port = (v[4]<<8&0xff00) + v[5]; + return 0; +} + + +static int kftp_pasv_connect(knetFile *ftp) +{ + char host[80], port[10]; + if (ftp->pasv_port == 0) { + fprintf(stderr, "[kftp_pasv_connect] kftp_pasv_prep() is not called before hand.\n"); + return -1; + } + sprintf(host, "%d.%d.%d.%d", ftp->pasv_ip[0], ftp->pasv_ip[1], ftp->pasv_ip[2], ftp->pasv_ip[3]); + sprintf(port, "%d", ftp->pasv_port); + ftp->fd = socket_connect(host, port); + if (ftp->fd == -1) return -1; + return 0; +} + +int kftp_connect(knetFile *ftp) +{ + ftp->ctrl_fd = socket_connect(ftp->host, ftp->port); + if (ftp->ctrl_fd == -1) return -1; + kftp_get_response(ftp); + kftp_send_cmd(ftp, "USER anonymous\r\n", 1); + kftp_send_cmd(ftp, "PASS kftp@\r\n", 1); + kftp_send_cmd(ftp, "TYPE I\r\n", 1); + return 0; +} + +int kftp_reconnect(knetFile *ftp) +{ + if (ftp->ctrl_fd != -1) { + netclose(ftp->ctrl_fd); + ftp->ctrl_fd = -1; + } + netclose(ftp->fd); + ftp->fd = -1; + return kftp_connect(ftp); +} + +// initialize ->type, ->host, ->retr and ->size +knetFile *kftp_parse_url(const char *fn, const char *mode) +{ + knetFile *fp; + char *p; + int l; + if (strstr(fn, "ftp://") != fn) return 0; + for (p = (char*)fn + 6; *p && *p != '/'; ++p); + if (*p != '/') return 0; + l = p - fn - 6; + fp = calloc(1, sizeof(knetFile)); + fp->type = KNF_TYPE_FTP; + fp->fd = -1; + /* the Linux/Mac version of socket_connect() also recognizes a port + * like "ftp", but the Windows version does not. */ + fp->port = strdup("21"); + fp->host = calloc(l + 1, 1); + if (strchr(mode, 'c')) fp->no_reconnect = 1; + strncpy(fp->host, fn + 6, l); + fp->retr = calloc(strlen(p) + 8, 1); + sprintf(fp->retr, "RETR %s\r\n", p); + fp->size_cmd = calloc(strlen(p) + 8, 1); + sprintf(fp->size_cmd, "SIZE %s\r\n", p); + fp->seek_offset = 0; + return fp; +} +// place ->fd at offset off +int kftp_connect_file(knetFile *fp) +{ + int ret; + long long file_size; + if (fp->fd != -1) { + netclose(fp->fd); + if (fp->no_reconnect) kftp_get_response(fp); + } + kftp_pasv_prep(fp); + kftp_send_cmd(fp, fp->size_cmd, 1); +#ifndef _WIN32 + if ( sscanf(fp->response,"%*d %lld", &file_size) != 1 ) + { + fprintf(stderr,"[kftp_connect_file] %s\n", fp->response); + return -1; + } +#else + const char *p = fp->response; + while (*p != ' ') ++p; + while (*p < '0' || *p > '9') ++p; + file_size = strtoint64(p); +#endif + fp->file_size = file_size; + if (fp->offset>=0) { + char tmp[32]; +#ifndef _WIN32 + sprintf(tmp, "REST %lld\r\n", (long long)fp->offset); +#else + strcpy(tmp, "REST "); + int64tostr(tmp + 5, fp->offset); + strcat(tmp, "\r\n"); +#endif + kftp_send_cmd(fp, tmp, 1); + } + kftp_send_cmd(fp, fp->retr, 0); + kftp_pasv_connect(fp); + ret = kftp_get_response(fp); + if (ret != 150) { + fprintf(stderr, "[kftp_connect_file] %s\n", fp->response); + netclose(fp->fd); + fp->fd = -1; + return -1; + } + fp->is_ready = 1; + return 0; +} + + +/************************** + * HTTP specific routines * + **************************/ + +knetFile *khttp_parse_url(const char *fn, const char *mode) +{ + knetFile *fp; + char *p, *proxy, *q; + int l; + if (strstr(fn, "http://") != fn) return 0; + // set ->http_host + for (p = (char*)fn + 7; *p && *p != '/'; ++p); + l = p - fn - 7; + fp = calloc(1, sizeof(knetFile)); + fp->http_host = calloc(l + 1, 1); + strncpy(fp->http_host, fn + 7, l); + fp->http_host[l] = 0; + for (q = fp->http_host; *q && *q != ':'; ++q); + if (*q == ':') *q++ = 0; + // get http_proxy + proxy = getenv("http_proxy"); + // set ->host, ->port and ->path + if (proxy == 0) { + fp->host = strdup(fp->http_host); // when there is no proxy, server name is identical to http_host name. + fp->port = strdup(*q? q : "80"); + fp->path = strdup(*p? p : "/"); + } else { + fp->host = (strstr(proxy, "http://") == proxy)? strdup(proxy + 7) : strdup(proxy); + for (q = fp->host; *q && *q != ':'; ++q); + if (*q == ':') *q++ = 0; + fp->port = strdup(*q? q : "80"); + fp->path = strdup(fn); + } + fp->type = KNF_TYPE_HTTP; + fp->ctrl_fd = fp->fd = -1; + fp->seek_offset = 0; + return fp; +} + +int khttp_connect_file(knetFile *fp) +{ + int ret, l = 0; + char *buf, *p; + if (fp->fd != -1) netclose(fp->fd); + fp->fd = socket_connect(fp->host, fp->port); + buf = calloc(0x10000, 1); // FIXME: I am lazy... But in principle, 64KB should be large enough. + l += sprintf(buf + l, "GET %s HTTP/1.0\r\nHost: %s\r\n", fp->path, fp->http_host); + l += sprintf(buf + l, "Range: bytes=%lld-\r\n", (long long)fp->offset); + l += sprintf(buf + l, "\r\n"); + netwrite(fp->fd, buf, l); + l = 0; + while (netread(fp->fd, buf + l, 1)) { // read HTTP header; FIXME: bad efficiency + if (buf[l] == '\n' && l >= 3) + if (strncmp(buf + l - 3, "\r\n\r\n", 4) == 0) break; + ++l; + } + buf[l] = 0; + if (l < 14) { // prematured header + netclose(fp->fd); + fp->fd = -1; + return -1; + } + ret = strtol(buf + 8, &p, 0); // HTTP return code + if (ret == 200 && fp->offset>0) { // 200 (complete result); then skip beginning of the file + off_t rest = fp->offset; + while (rest) { + off_t l = rest < 0x10000? rest : 0x10000; + rest -= my_netread(fp->fd, buf, l); + } + } else if (ret != 206 && ret != 200) { + free(buf); + fprintf(stderr, "[khttp_connect_file] fail to open file (HTTP code: %d).\n", ret); + netclose(fp->fd); + fp->fd = -1; + return -1; + } + free(buf); + fp->is_ready = 1; + return 0; +} + +/******************** + * Generic routines * + ********************/ + +knetFile *knet_open(const char *fn, const char *mode) +{ + knetFile *fp = 0; + if (mode[0] != 'r') { + fprintf(stderr, "[kftp_open] only mode \"r\" is supported.\n"); + return 0; + } + if (strstr(fn, "ftp://") == fn) { + fp = kftp_parse_url(fn, mode); + if (fp == 0) return 0; + if (kftp_connect(fp) == -1) { + knet_close(fp); + return 0; + } + kftp_connect_file(fp); + } else if (strstr(fn, "http://") == fn) { + fp = khttp_parse_url(fn, mode); + if (fp == 0) return 0; + khttp_connect_file(fp); + } else { // local file +#ifdef _WIN32 + /* In windows, O_BINARY is necessary. In Linux/Mac, O_BINARY may + * be undefined on some systems, although it is defined on my + * Mac and the Linux I have tested on. */ + int fd = open(fn, O_RDONLY | O_BINARY); +#else + int fd = open(fn, O_RDONLY); +#endif + if (fd == -1) { + perror("open"); + return 0; + } + fp = (knetFile*)calloc(1, sizeof(knetFile)); + fp->type = KNF_TYPE_LOCAL; + fp->fd = fd; + fp->ctrl_fd = -1; + } + if (fp && fp->fd == -1) { + knet_close(fp); + return 0; + } + return fp; +} + +knetFile *knet_dopen(int fd, const char *mode) +{ + knetFile *fp = (knetFile*)calloc(1, sizeof(knetFile)); + fp->type = KNF_TYPE_LOCAL; + fp->fd = fd; + return fp; +} + +off_t knet_read(knetFile *fp, void *buf, off_t len) +{ + off_t l = 0; + if (fp->fd == -1) return 0; + if (fp->type == KNF_TYPE_FTP) { + if (fp->is_ready == 0) { + if (!fp->no_reconnect) kftp_reconnect(fp); + kftp_connect_file(fp); + } + } else if (fp->type == KNF_TYPE_HTTP) { + if (fp->is_ready == 0) + khttp_connect_file(fp); + } + if (fp->type == KNF_TYPE_LOCAL) { // on Windows, the following block is necessary; not on UNIX + off_t rest = len, curr; + while (rest) { + curr = read(fp->fd, buf + l, rest); + if (curr == 0) break; + l += curr; rest -= curr; + } + } else l = my_netread(fp->fd, buf, len); + fp->offset += l; + return l; +} + +off_t knet_seek(knetFile *fp, int64_t off, int whence) +{ + if (whence == SEEK_SET && off == fp->offset) return 0; + if (fp->type == KNF_TYPE_LOCAL) { + /* Be aware that lseek() returns the offset after seeking, + * while fseek() returns zero on success. */ + off_t offset = lseek(fp->fd, off, whence); + if (offset == -1) { + // Be silent, it is OK for knet_seek to fail when the file is streamed + // fprintf(stderr,"[knet_seek] %s\n", strerror(errno)); + return -1; + } + fp->offset = offset; + return 0; + } + else if (fp->type == KNF_TYPE_FTP) + { + if (whence==SEEK_CUR) + fp->offset += off; + else if (whence==SEEK_SET) + fp->offset = off; + else if ( whence==SEEK_END) + fp->offset = fp->file_size+off; + fp->is_ready = 0; + return 0; + } + else if (fp->type == KNF_TYPE_HTTP) + { + if (whence == SEEK_END) { // FIXME: can we allow SEEK_END in future? + fprintf(stderr, "[knet_seek] SEEK_END is not supported for HTTP. Offset is unchanged.\n"); + errno = ESPIPE; + return -1; + } + if (whence==SEEK_CUR) + fp->offset += off; + else if (whence==SEEK_SET) + fp->offset = off; + fp->is_ready = 0; + return fp->offset; + } + errno = EINVAL; + fprintf(stderr,"[knet_seek] %s\n", strerror(errno)); + return -1; +} + +int knet_close(knetFile *fp) +{ + if (fp == 0) return 0; + if (fp->ctrl_fd != -1) netclose(fp->ctrl_fd); // FTP specific + if (fp->fd != -1) { + /* On Linux/Mac, netclose() is an alias of close(), but on + * Windows, it is an alias of closesocket(). */ + if (fp->type == KNF_TYPE_LOCAL) close(fp->fd); + else netclose(fp->fd); + } + free(fp->host); free(fp->port); + free(fp->response); free(fp->retr); // FTP specific + free(fp->path); free(fp->http_host); // HTTP specific + free(fp); + return 0; +} + +#ifdef KNETFILE_MAIN +int main(void) +{ + char *buf; + knetFile *fp; + int type = 4, l; +#ifdef _WIN32 + knet_win32_init(); +#endif + buf = calloc(0x100000, 1); + if (type == 0) { + fp = knet_open("knetfile.c", "r"); + knet_seek(fp, 1000, SEEK_SET); + } else if (type == 1) { // NCBI FTP, large file + fp = knet_open("ftp://ftp.ncbi.nih.gov/1000genomes/ftp/data/NA12878/alignment/NA12878.chrom6.SLX.SRP000032.2009_06.bam", "r"); + knet_seek(fp, 2500000000ll, SEEK_SET); + l = knet_read(fp, buf, 255); + } else if (type == 2) { + fp = knet_open("ftp://ftp.sanger.ac.uk/pub4/treefam/tmp/index.shtml", "r"); + knet_seek(fp, 1000, SEEK_SET); + } else if (type == 3) { + fp = knet_open("http://www.sanger.ac.uk/Users/lh3/index.shtml", "r"); + knet_seek(fp, 1000, SEEK_SET); + } else if (type == 4) { + fp = knet_open("http://www.sanger.ac.uk/Users/lh3/ex1.bam", "r"); + knet_read(fp, buf, 10000); + knet_seek(fp, 20000, SEEK_SET); + knet_seek(fp, 10000, SEEK_SET); + l = knet_read(fp, buf+10000, 10000000) + 10000; + } + if (type != 4 && type != 1) { + knet_read(fp, buf, 255); + buf[255] = 0; + printf("%s\n", buf); + } else write(fileno(stdout), buf, l); + knet_close(fp); + free(buf); + return 0; +} +#endif diff --git a/knetfile.h b/knetfile.h new file mode 100644 index 0000000..0a0e66f --- /dev/null +++ b/knetfile.h @@ -0,0 +1,75 @@ +#ifndef KNETFILE_H +#define KNETFILE_H + +#include +#include + +#ifndef _WIN32 +#define netread(fd, ptr, len) read(fd, ptr, len) +#define netwrite(fd, ptr, len) write(fd, ptr, len) +#define netclose(fd) close(fd) +#else +#include +#define netread(fd, ptr, len) recv(fd, ptr, len, 0) +#define netwrite(fd, ptr, len) send(fd, ptr, len, 0) +#define netclose(fd) closesocket(fd) +#endif + +// FIXME: currently I/O is unbuffered + +#define KNF_TYPE_LOCAL 1 +#define KNF_TYPE_FTP 2 +#define KNF_TYPE_HTTP 3 + +typedef struct knetFile_s { + int type, fd; + int64_t offset; + char *host, *port; + + // the following are for FTP only + int ctrl_fd, pasv_ip[4], pasv_port, max_response, no_reconnect, is_ready; + char *response, *retr, *size_cmd; + int64_t seek_offset; // for lazy seek + int64_t file_size; + + // the following are for HTTP only + char *path, *http_host; +} knetFile; + +#define knet_tell(fp) ((fp)->offset) +#define knet_fileno(fp) ((fp)->fd) + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _WIN32 + int knet_win32_init(); + void knet_win32_destroy(); +#endif + + knetFile *knet_open(const char *fn, const char *mode); + + /* + This only works with local files. + */ + knetFile *knet_dopen(int fd, const char *mode); + + /* + If ->is_ready==0, this routine updates ->fd; otherwise, it simply + reads from ->fd. + */ + off_t knet_read(knetFile *fp, void *buf, off_t len); + + /* + This routine only sets ->offset and ->is_ready=0. It does not + communicate with the FTP server. + */ + off_t knet_seek(knetFile *fp, int64_t off, int whence); + int knet_close(knetFile *fp); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/ksort.h b/ksort.h new file mode 100644 index 0000000..16a03fd --- /dev/null +++ b/ksort.h @@ -0,0 +1,271 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li */ + +/* + 2008-11-16 (0.1.4): + + * Fixed a bug in introsort() that happens in rare cases. + + 2008-11-05 (0.1.3): + + * Fixed a bug in introsort() for complex comparisons. + + * Fixed a bug in mergesort(). The previous version is not stable. + + 2008-09-15 (0.1.2): + + * Accelerated introsort. On my Mac (not on another Linux machine), + my implementation is as fast as std::sort on random input. + + * Added combsort and in introsort, switch to combsort if the + recursion is too deep. + + 2008-09-13 (0.1.1): + + * Added k-small algorithm + + 2008-09-05 (0.1.0): + + * Initial version + +*/ + +#ifndef AC_KSORT_H +#define AC_KSORT_H + +#include +#include + +typedef struct { + void *left, *right; + int depth; +} ks_isort_stack_t; + +#define KSORT_SWAP(type_t, a, b) { register type_t t=(a); (a)=(b); (b)=t; } + +#define KSORT_INIT(name, type_t, __sort_lt) \ + void ks_mergesort_##name(size_t n, type_t array[], type_t temp[]) \ + { \ + type_t *a2[2], *a, *b; \ + int curr, shift; \ + \ + a2[0] = array; \ + a2[1] = temp? temp : (type_t*)malloc(sizeof(type_t) * n); \ + for (curr = 0, shift = 0; (1ul<> 1) - 1; i != (size_t)(-1); --i) \ + ks_heapadjust_##name(i, lsize, l); \ + } \ + void ks_heapsort_##name(size_t lsize, type_t l[]) \ + { \ + size_t i; \ + for (i = lsize - 1; i > 0; --i) { \ + type_t tmp; \ + tmp = *l; *l = l[i]; l[i] = tmp; ks_heapadjust_##name(0, i, l); \ + } \ + } \ + inline void __ks_insertsort_##name(type_t *s, type_t *t) \ + { \ + type_t *i, *j, swap_tmp; \ + for (i = s + 1; i < t; ++i) \ + for (j = i; j > s && __sort_lt(*j, *(j-1)); --j) { \ + swap_tmp = *j; *j = *(j-1); *(j-1) = swap_tmp; \ + } \ + } \ + void ks_combsort_##name(size_t n, type_t a[]) \ + { \ + const double shrink_factor = 1.2473309501039786540366528676643; \ + int do_swap; \ + size_t gap = n; \ + type_t tmp, *i, *j; \ + do { \ + if (gap > 2) { \ + gap = (size_t)(gap / shrink_factor); \ + if (gap == 9 || gap == 10) gap = 11; \ + } \ + do_swap = 0; \ + for (i = a; i < a + n - gap; ++i) { \ + j = i + gap; \ + if (__sort_lt(*j, *i)) { \ + tmp = *i; *i = *j; *j = tmp; \ + do_swap = 1; \ + } \ + } \ + } while (do_swap || gap > 2); \ + if (gap != 1) __ks_insertsort_##name(a, a + n); \ + } \ + void ks_introsort_##name(size_t n, type_t a[]) \ + { \ + int d; \ + ks_isort_stack_t *top, *stack; \ + type_t rp, swap_tmp; \ + type_t *s, *t, *i, *j, *k; \ + \ + if (n < 1) return; \ + else if (n == 2) { \ + if (__sort_lt(a[1], a[0])) { swap_tmp = a[0]; a[0] = a[1]; a[1] = swap_tmp; } \ + return; \ + } \ + for (d = 2; 1ul<>1) + 1; \ + if (__sort_lt(*k, *i)) { \ + if (__sort_lt(*k, *j)) k = j; \ + } else k = __sort_lt(*j, *i)? i : j; \ + rp = *k; \ + if (k != t) { swap_tmp = *k; *k = *t; *t = swap_tmp; } \ + for (;;) { \ + do ++i; while (__sort_lt(*i, rp)); \ + do --j; while (i <= j && __sort_lt(rp, *j)); \ + if (j <= i) break; \ + swap_tmp = *i; *i = *j; *j = swap_tmp; \ + } \ + swap_tmp = *i; *i = *t; *t = swap_tmp; \ + if (i-s > t-i) { \ + if (i-s > 16) { top->left = s; top->right = i-1; top->depth = d; ++top; } \ + s = t-i > 16? i+1 : t; \ + } else { \ + if (t-i > 16) { top->left = i+1; top->right = t; top->depth = d; ++top; } \ + t = i-s > 16? i-1 : s; \ + } \ + } else { \ + if (top == stack) { \ + free(stack); \ + __ks_insertsort_##name(a, a+n); \ + return; \ + } else { --top; s = (type_t*)top->left; t = (type_t*)top->right; d = top->depth; } \ + } \ + } \ + } \ + /* This function is adapted from: http://ndevilla.free.fr/median/ */ \ + /* 0 <= kk < n */ \ + type_t ks_ksmall_##name(size_t n, type_t arr[], size_t kk) \ + { \ + type_t *low, *high, *k, *ll, *hh, *mid; \ + low = arr; high = arr + n - 1; k = arr + kk; \ + for (;;) { \ + if (high <= low) return *k; \ + if (high == low + 1) { \ + if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \ + return *k; \ + } \ + mid = low + (high - low) / 2; \ + if (__sort_lt(*high, *mid)) KSORT_SWAP(type_t, *mid, *high); \ + if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \ + if (__sort_lt(*low, *mid)) KSORT_SWAP(type_t, *mid, *low); \ + KSORT_SWAP(type_t, *mid, *(low+1)); \ + ll = low + 1; hh = high; \ + for (;;) { \ + do ++ll; while (__sort_lt(*ll, *low)); \ + do --hh; while (__sort_lt(*low, *hh)); \ + if (hh < ll) break; \ + KSORT_SWAP(type_t, *ll, *hh); \ + } \ + KSORT_SWAP(type_t, *low, *hh); \ + if (hh <= k) low = ll; \ + if (hh >= k) high = hh - 1; \ + } \ + } + +#define ks_mergesort(name, n, a, t) ks_mergesort_##name(n, a, t) +#define ks_introsort(name, n, a) ks_introsort_##name(n, a) +#define ks_combsort(name, n, a) ks_combsort_##name(n, a) +#define ks_heapsort(name, n, a) ks_heapsort_##name(n, a) +#define ks_heapmake(name, n, a) ks_heapmake_##name(n, a) +#define ks_heapadjust(name, i, n, a) ks_heapadjust_##name(i, n, a) +#define ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k) + +#define ks_lt_generic(a, b) ((a) < (b)) +#define ks_lt_str(a, b) (strcmp((a), (b)) < 0) + +typedef const char *ksstr_t; + +#define KSORT_INIT_GENERIC(type_t) KSORT_INIT(type_t, type_t, ks_lt_generic) +#define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str) + +#endif diff --git a/kstring.c b/kstring.c new file mode 100644 index 0000000..e0203fa --- /dev/null +++ b/kstring.c @@ -0,0 +1,165 @@ +#include +#include +#include +#include +#include +#include "kstring.h" + +int ksprintf(kstring_t *s, const char *fmt, ...) +{ + va_list ap; + int l; + va_start(ap, fmt); + l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); // This line does not work with glibc 2.0. See `man snprintf'. + va_end(ap); + if (l + 1 > s->m - s->l) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + va_start(ap, fmt); + l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); + } + va_end(ap); + s->l += l; + return l; +} + +// s MUST BE a null terminated string; l = strlen(s) +int ksplit_core(char *s, int delimiter, int *_max, int **_offsets) +{ + int i, n, max, last_char, last_start, *offsets, l; + n = 0; max = *_max; offsets = *_offsets; + l = strlen(s); + +#define __ksplit_aux do { \ + if (_offsets) { \ + s[i] = 0; \ + if (n == max) { \ + max = max? max<<1 : 2; \ + offsets = (int*)realloc(offsets, sizeof(int) * max); \ + } \ + offsets[n++] = last_start; \ + } else ++n; \ + } while (0) + + for (i = 0, last_char = last_start = 0; i <= l; ++i) { + if (delimiter == 0) { + if (isspace(s[i]) || s[i] == 0) { + if (isgraph(last_char)) __ksplit_aux; // the end of a field + } else { + if (isspace(last_char) || last_char == 0) last_start = i; + } + } else { + if (s[i] == delimiter || s[i] == 0) { + if (last_char != 0 && last_char != delimiter) __ksplit_aux; // the end of a field + } else { + if (last_char == delimiter || last_char == 0) last_start = i; + } + } + last_char = s[i]; + } + *_max = max; *_offsets = offsets; + return n; +} + +/********************** + * Boyer-Moore search * + **********************/ + +// reference: http://www-igm.univ-mlv.fr/~lecroq/string/node14.html +int *ksBM_prep(const uint8_t *pat, int m) +{ + int i, *suff, *prep, *bmGs, *bmBc; + prep = calloc(m + 256, 1); + bmGs = prep; bmBc = prep + m; + { // preBmBc() + for (i = 0; i < 256; ++i) bmBc[i] = m; + for (i = 0; i < m - 1; ++i) bmBc[pat[i]] = m - i - 1; + } + suff = calloc(m, sizeof(int)); + { // suffixes() + int f = 0, g; + suff[m - 1] = m; + g = m - 1; + for (i = m - 2; i >= 0; --i) { + if (i > g && suff[i + m - 1 - f] < i - g) + suff[i] = suff[i + m - 1 - f]; + else { + if (i < g) g = i; + f = i; + while (g >= 0 && pat[g] == pat[g + m - 1 - f]) --g; + suff[i] = f - g; + } + } + } + { // preBmGs() + int j = 0; + for (i = 0; i < m; ++i) bmGs[i] = m; + for (i = m - 1; i >= 0; --i) + if (suff[i] == i + 1) + for (; j < m - 1 - i; ++j) + if (bmGs[j] == m) + bmGs[j] = m - 1 - i; + for (i = 0; i <= m - 2; ++i) + bmGs[m - 1 - suff[i]] = m - 1 - i; + } + free(suff); + return prep; +} + +int *ksBM_search(const uint8_t *str, int n, const uint8_t *pat, int m, int *_prep, int *n_matches) +{ + int i, j, *prep, *bmGs, *bmBc; + int *matches = 0, mm = 0, nm = 0; + prep = _prep? _prep : ksBM_prep(pat, m); + bmGs = prep; bmBc = prep + m; + j = 0; + while (j <= n - m) { + for (i = m - 1; i >= 0 && pat[i] == str[i+j]; --i); + if (i < 0) { + if (nm == mm) { + mm = mm? mm<<1 : 1; + matches = realloc(matches, mm * sizeof(int)); + } + matches[nm++] = j; + j += bmGs[0]; + } else { + int max = bmBc[str[i+j]] - m + 1 + i; + if (max < bmGs[i]) max = bmGs[i]; + j += max; + } + } + *n_matches = nm; + if (_prep == 0) free(prep); + return matches; +} + +#ifdef KSTRING_MAIN +#include +int main() +{ + kstring_t *s; + int *fields, n, i; + s = (kstring_t*)calloc(1, sizeof(kstring_t)); + // test ksprintf() + ksprintf(s, " abcdefg: %d ", 100); + printf("'%s'\n", s->s); + // test ksplit() + fields = ksplit(s, 0, &n); + for (i = 0; i < n; ++i) + printf("field[%d] = '%s'\n", i, s->s + fields[i]); + free(s); + + { + static char *str = "abcdefgcdg"; + static char *pat = "cd"; + int n, *matches; + matches = ksBM_search(str, strlen(str), pat, strlen(pat), 0, &n); + printf("%d: \n", n); + for (i = 0; i < n; ++i) + printf("- %d\n", matches[i]); + free(matches); + } + return 0; +} +#endif diff --git a/kstring.h b/kstring.h new file mode 100644 index 0000000..f4e5a99 --- /dev/null +++ b/kstring.h @@ -0,0 +1,68 @@ +#ifndef KSTRING_H +#define KSTRING_H + +#include +#include +#include + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +#ifndef KSTRING_T +#define KSTRING_T kstring_t +typedef struct __kstring_t { + size_t l, m; + char *s; +} kstring_t; +#endif + +int ksprintf(kstring_t *s, const char *fmt, ...); +int ksplit_core(char *s, int delimiter, int *_max, int **_offsets); + +// calculate the auxiliary array, allocated by calloc() +int *ksBM_prep(const uint8_t *pat, int m); + +/* Search pat in str and returned the list of matches. The size of the + * list is returned as n_matches. _prep is the array returned by + * ksBM_prep(). If it is a NULL pointer, ksBM_prep() will be called. */ +int *ksBM_search(const uint8_t *str, int n, const uint8_t *pat, int m, int *_prep, int *n_matches); + +static inline int kputsn(const char *p, int l, kstring_t *s) +{ + if (s->l + l + 1 >= s->m) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } + strncpy(s->s + s->l, p, l); + s->l += l; + s->s[s->l] = 0; + return l; +} + +static inline int kputs(const char *p, kstring_t *s) +{ + return kputsn(p, strlen(p), s); +} + +static inline int kputc(int c, kstring_t *s) +{ + if (s->l + 1 >= s->m) { + s->m = s->l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } + s->s[s->l++] = c; + s->s[s->l] = 0; + return c; +} + +static inline int *ksplit(kstring_t *s, int delimiter, int *n) +{ + int max = 0, *offsets = 0; + *n = ksplit_core(s->s, delimiter, &max, &offsets); + return offsets; +} + +#endif diff --git a/main.c b/main.c new file mode 100644 index 0000000..faef7fb --- /dev/null +++ b/main.c @@ -0,0 +1,93 @@ +#include +#include +#include +#include +#include "bgzf.h" +#include "tabix.h" + +#define PACKAGE_VERSION "0.1.0 (r506)" + +static int fetch_func(int l, const char *s, void *data) +{ + printf("%s\n", s); + return 0; +} + +int main(int argc, char *argv[]) +{ + int c, skip = -1, meta = -1; + ti_conf_t conf = ti_conf_gff; + while ((c = getopt(argc, argv, "p:s:b:e:0S:c:")) >= 0) { + switch (c) { + case '0': conf.preset |= TI_FLAG_UCSC; break; + case 'S': skip = atoi(optarg); break; + case 'c': meta = optarg[0]; break; + case 'p': + if (strcmp(optarg, "gff") == 0) conf = ti_conf_gff; + else if (strcmp(optarg, "bed") == 0) conf = ti_conf_bed; + else if (strcmp(optarg, "sam") == 0) conf = ti_conf_sam; + else if (strcmp(optarg, "vcf") == 0) conf = ti_conf_vcf; + else if (strcmp(optarg, "psltbl") == 0) conf = ti_conf_psltbl; + else { + fprintf(stderr, "[main] unrecognized preset '%s'\n", optarg); + return 1; + } + break; + case 's': conf.sc = atoi(optarg); break; + case 'b': conf.bc = atoi(optarg); break; + case 'e': conf.ec = atoi(optarg); break; + } + } + if (skip >= 0) conf.line_skip = skip; + if (meta >= 0) conf.meta_char = meta; + if (optind == argc) { + fprintf(stderr, "\n"); + fprintf(stderr, "Program: tabix (TAB-delimited file InderXer)\n"); + fprintf(stderr, "Version: %s\n\n", PACKAGE_VERSION); + fprintf(stderr, "Usage: tabix [region1 [region2 [...]]]\n\n"); + fprintf(stderr, "Options: -p STR preset: gff, bed, sam, vcf, psltbl [gff]\n"); + fprintf(stderr, " -s INT sequence name column [1]\n"); + fprintf(stderr, " -b INT start column [4]\n"); + fprintf(stderr, " -e INT end column [5]\n"); + fprintf(stderr, " -S INT skip first INT lines [0]\n"); + fprintf(stderr, " -c CHAR symbol for comment/meta lines [#]\n"); + fprintf(stderr, " -0 zero-based coordinate\n"); + fprintf(stderr, "\n"); + return 1; + } + if (optind + 1 == argc) + return ti_index_build(argv[optind], &conf); + { // retrieve + BGZF *fp; + fp = bgzf_open(argv[optind], "r"); + if (fp == 0) { + fprintf(stderr, "[main] fail to open the data file.\n"); + return 1; + } + if (strcmp(argv[optind+1], ".") == 0) { // retrieve all + kstring_t *str = calloc(1, sizeof(kstring_t)); + while (ti_readline(fp, str) >= 0) { // FIXME: check return code for error + fputs(str->s, stdout); fputc('\n', stdout); + } + free(str->s); free(str); + } else { // retrieve from specified regions + ti_index_t *idx; + int i; + idx = ti_index_load(argv[optind]); + if (idx == 0) { + bgzf_close(fp); + fprintf(stderr, "[main] fail to load the index.\n"); + return 1; + } + for (i = optind + 1; i < argc; ++i) { + int tid, beg, end; + if (ti_parse_region(idx, argv[i], &tid, &beg, &end) == 0) { + ti_fetch(fp, idx, tid, beg, end, 0, fetch_func); + } else fprintf(stderr, "[main] invalid region: unknown target name or minus interval.\n"); + } + ti_index_destroy(idx); + } + bgzf_close(fp); + } + return 0; +} diff --git a/tabix.1 b/tabix.1 new file mode 100644 index 0000000..6b6e2d6 --- /dev/null +++ b/tabix.1 @@ -0,0 +1,117 @@ +.TH tabix 1 "2 November 2009" "tabix-0.1.0" "Bioinformatics tools" +.SH NAME +.PP +bgzip - Block compression/decompression utility +.PP +tabix - Generic indexer for TAB-delimited genome position files +.SH SYNOPSIS +.PP +.B bgzip +.RB [ \-cdh ] +.RB [ \-b +.IR virtualOffset ] +.RB [ \-s +.IR size ] +.RI [ file ] +.PP +.B tabix +.RB [ \-0 ] +.RB [ \-p +.R gff|bed|sam|vcf] +.RB [ \-s +.IR seqCol ] +.RB [ \-b +.IR begCol ] +.RB [ \-e +.IR endCol ] +.RB [ \-S +.IR lineSkip ] +.RB [ \-c +.IR metaChar ] +.I in.tab.bgz +.RI [ "region1 " [ "region2 " [ ... "]]]" + +.SH DESCRIPTION +.PP +Tabix indexes a TAB-delimited genome position file +.I in.tab.bgz +and creates an index file +.I in.tab.bgz.tbi +when +.I region +is absent from the command-line. The input data file must be position +sorted and compressed by +.B bgzip +which has a +.BR gzip (1) +like interface. After indexing, tabix is able to quickly retrieve data +lines overlapping +.I regions +specified in the format "chr:beginPos-endPos". Fast data retrieval also +works over network if URI is given as a file name and in this case the +index file will be downloaded if it is not present locally. + +.SH OPTIONS OF TABIX +.TP 10 +.BI "-p " STR +Input format for indexing. Valid values are: gff, bed, sam, vcf and +psltab. This option should not be applied together with any of +.BR \-s ", " \-b ", " \-e ", " \-c " and " \-0 ; +it is not used for data retrieval because this setting is stored in +the index file. [gff] +.TP +.BI "-s " INT +Column of sequence name. Option +.BR \-s ", " \-b ", " \-e ", " \-S ", " \-c " and " \-0 +are all stored in the index file and thus not used in data retrieval. [1] +.TP +.BI "-b " INT +Column of start chromosomal position. [4] +.TP +.BI "-e " INT +Column of end chromosomal position. [5] +.TP +.BI "-S " INT +Skip first INT lines in the data file. [0] +.TP +.BI "-c " CHAR +Skip lines started with character CHAR. [#] +.TP +.B -0 +Specify that the position in the data file is 0-based (e.g. UCSC files) +rather than 1-based. +.RE + +.SH EXAMPLE +grep -v ^"#" unsorted.gff | sort -k1,1 -k4,4n | bgzip -c > sorted.gff.gz; + +tabix -p gff sorted.gff.gz; + +tabix sorted.gff.gz chr1:10,000,000-20,000,000; + +.SH NOTES +It is straightforward to achieve overlap queries using the standard +B-tree index (with or without binning) implemented in all SQL databases, +or the R-tree index in PostgreSQL and Oracle. But there are still many +reasons to use tabix. Firstly, tabix directly works with a lot of widely +used TAB-delimited formats such as GFF/GTF and BED. We do not need to +design database schema or specialized binary formats. Data do not need +to be duplicated in different formats, either. Secondly, tabix works on +compressed data files while most SQL databases do not. The GenCode +annotation GTF can be compressed down to 4%. Thirdly, tabix is +fast. The same indexing algorithm is known to work efficiently for an +alignment with a few billion short reads. SQL databases probably cannot +easily handle data at this scale. Last but not the least, tabix supports +remote data retrieval. One can put the data file and the index at an FTP +or HTTP server, and other users or even web services will be able to get +a slice without downloading the entire file. + +.SH AUTHOR +.PP +Tabix was written by Heng Li. The BGZF library was originally +implemented by Bob Handsaker and modified by Heng Li for remote file +access and in-memory caching. + +.SH SEE ALSO +.PP +.BR samtools (1) diff --git a/tabix.h b/tabix.h new file mode 100644 index 0000000..e608c79 --- /dev/null +++ b/tabix.h @@ -0,0 +1,43 @@ +#ifndef __TABIDX_H +#define __TABIDX_H + +#include +#include "kstring.h" +#include "bgzf.h" + +#define TI_PRESET_GENERIC 0 +#define TI_PRESET_SAM 1 +#define TI_PRESET_VCF 2 + +#define TI_FLAG_UCSC 0x10000 + +typedef int (*ti_fetch_f)(int l, const char *s, void *data); + +struct __ti_index_t; +typedef struct __ti_index_t ti_index_t; + +typedef struct { + int32_t preset; + int32_t sc, bc, ec; + int32_t meta_char, line_skip; +} ti_conf_t; + +extern ti_conf_t ti_conf_gff, ti_conf_bed, ti_conf_psltbl, ti_conf_vcf, ti_conf_sam; + +#ifdef __cplusplus +extern "C" { +#endif + + int ti_readline(BGZF *fp, kstring_t *str); + + int ti_index_build(const char *fn, const ti_conf_t *conf); + ti_index_t *ti_index_load(const char *fn); + void ti_index_destroy(ti_index_t *idx); + int ti_parse_region(ti_index_t *idx, const char *str, int *tid, int *begin, int *end); + int ti_fetch(BGZF *fp, const ti_index_t *idx, int tid, int beg, int end, void *data, ti_fetch_f func); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/tabix.txt b/tabix.txt new file mode 100644 index 0000000..5ba263b --- /dev/null +++ b/tabix.txt @@ -0,0 +1,89 @@ +tabix(1) Bioinformatics tools tabix(1) + + + +NAME + bgzip - Block compression/decompression utility + + tabix - Generic indexer for TAB-delimited genome position files + +SYNOPSIS + bgzip [-cdh] [-b virtualOffset] [-s size] [file] + + tabix [-0] [-p gff|bed|sam|vcf] [-s seqCol] [-b begCol] [-e endCol] [-S + lineSkip] [-c metaChar] in.tab.bgz [region1 [region2 [...]]] + + +DESCRIPTION + Tabix indexes a TAB-delimited genome position file in.tab.bgz and cre- + ates an index file in.tab.bgz.tbi when region is absent from the com- + mand-line. The input data file must be position sorted and compressed + by bgzip which has a gzip(1) like interface. After indexing, tabix is + able to quickly retrieve data lines overlapping regions specified in + the format "chr:beginPos-endPos". Fast data retrieval also works over + network if URI is given as a file name and in this case the index file + will be downloaded if it is not present locally. + + +OPTIONS OF TABIX + -p STR Input format for indexing. Valid values are: gff, bed, sam, + vcf and psltab. This option should not be applied together + with any of -s, -b, -e, -c and -0; it is not used for data + retrieval because this setting is stored in the index file. + [gff] + + -s INT Column of sequence name. Option -s, -b, -e, -S, -c and -0 are + all stored in the index file and thus not used in data + retrieval. [1] + + -b INT Column of start chromosomal position. [4] + + -e INT Column of end chromosomal position. [5] + + -S INT Skip first INT lines in the data file. [0] + + -c CHAR Skip lines started with character CHAR. [#] + + -0 Specify that the position in the data file is 0-based (e.g. + UCSC files) rather than 1-based. + + +EXAMPLE + grep -v ^"#" unsorted.gff | sort -k1,1 -k4,4n | bgzip -c > + sorted.gff.gz; + + tabix -p gff sorted.gff.gz; + + tabix sorted.gff.gz chr1:10,000,000-20,000,000; + + +NOTES + It is straightforward to achieve overlap queries using the standard B- + tree index (with or without binning) implemented in all SQL databases, + or the R-tree index in PostgreSQL and Oracle. But there are still many + reasons to use tabix. Firstly, tabix directly works with a lot of + widely used TAB-delimited formats such as GFF/GTF and BED. We do not + need to design database schema or specialized binary formats. Data do + not need to be duplicated in different formats, either. Secondly, tabix + works on compressed data files while most SQL databases do not. The + GenCode annotation GTF can be compressed down to 4%. Thirdly, tabix is + fast. The same indexing algorithm is known to work efficiently for an + alignment with a few billion short reads. SQL databases probably cannot + easily handle data at this scale. Last but not the least, tabix sup- + ports remote data retrieval. One can put the data file and the index at + an FTP or HTTP server, and other users or even web services will be + able to get a slice without downloading the entire file. + + +AUTHOR + Tabix was written by Heng Li. The BGZF library was originally imple- + mented by Bob Handsaker and modified by Heng Li for remote file access + and in-memory caching. + + +SEE ALSO + samtools(1) + + + +tabix-0.1.0 2 November 2009 tabix(1)