From bc08e6c2f34c1367d0fe75dd19cedccfab881229 Mon Sep 17 00:00:00 2001 From: Charles Plessy Date: Thu, 19 Aug 2010 15:49:09 +0900 Subject: [PATCH] Imported Upstream version 0.1.0 --- Makefile | 50 ++++ bam_endian.h | 42 +++ bgzf.c | 676 +++++++++++++++++++++++++++++++++++++++++++ bgzf.h | 156 ++++++++++ bgzip.c | 179 ++++++++++++ index.c | 789 +++++++++++++++++++++++++++++++++++++++++++++++++++ khash.h | 486 +++++++++++++++++++++++++++++++ knetfile.c | 632 +++++++++++++++++++++++++++++++++++++++++ knetfile.h | 75 +++++ ksort.h | 271 ++++++++++++++++++ kstring.c | 165 +++++++++++ kstring.h | 68 +++++ main.c | 93 ++++++ tabix.1 | 117 ++++++++ tabix.h | 43 +++ tabix.txt | 89 ++++++ 16 files changed, 3931 insertions(+) create mode 100644 Makefile create mode 100644 bam_endian.h create mode 100644 bgzf.c create mode 100644 bgzf.h create mode 100644 bgzip.c create mode 100644 index.c create mode 100644 khash.h create mode 100644 knetfile.c create mode 100644 knetfile.h create mode 100644 ksort.h create mode 100644 kstring.c create mode 100644 kstring.h create mode 100644 main.c create mode 100644 tabix.1 create mode 100644 tabix.h create mode 100644 tabix.txt diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..3d7153d --- /dev/null +++ b/Makefile @@ -0,0 +1,50 @@ +CC= gcc +CFLAGS= -g -Wall -O2 #-m64 #-arch ppc +DFLAGS= -D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE +LOBJS= bgzf.o kstring.o knetfile.o index.o +AOBJS= main.o +PROG= tabix bgzip +INCLUDES= +SUBDIRS= . +LIBPATH= +LIBCURSES= + +.SUFFIXES:.c .o + +.c.o: + $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@ + +all-recur lib-recur clean-recur cleanlocal-recur install-recur: + @target=`echo $@ | sed s/-recur//`; \ + wdir=`pwd`; \ + list='$(SUBDIRS)'; for subdir in $$list; do \ + cd $$subdir; \ + $(MAKE) CC="$(CC)" DFLAGS="$(DFLAGS)" CFLAGS="$(CFLAGS)" \ + INCLUDES="$(INCLUDES)" LIBPATH="$(LIBPATH)" $$target || exit 1; \ + cd $$wdir; \ + done; + +all:$(PROG) + +lib:libtabix.a + +libtabix.a:$(LOBJS) + $(AR) -cru $@ $(LOBJS) + +tabix:lib $(AOBJS) + $(CC) $(CFLAGS) -o $@ $(AOBJS) -lm $(LIBPATH) -lz -L. -ltabix + +bgzip:bgzip.o bgzf.o knetfile.o + $(CC) $(CFLAGS) -o $@ bgzip.o bgzf.o knetfile.o -lz + +kstring.o:kstring.h +knetfile.o:knetfile.h +bgzf.o:bgzf.h knetfile.h +index.o:bgzf.h tabix.h khash.h ksort.h kstring.h +main.o:tabix.h kstring.h bgzf.h +bgzip.o:bgzf.h + +cleanlocal: + rm -fr gmon.out *.o a.out *.dSYM $(PROG) *~ *.a + +clean:cleanlocal-recur diff --git a/bam_endian.h b/bam_endian.h new file mode 100644 index 0000000..0fc74a8 --- /dev/null +++ b/bam_endian.h @@ -0,0 +1,42 @@ +#ifndef BAM_ENDIAN_H +#define BAM_ENDIAN_H + +#include + +static inline int bam_is_big_endian() +{ + long one= 1; + return !(*((char *)(&one))); +} +static inline uint16_t bam_swap_endian_2(uint16_t v) +{ + return (uint16_t)(((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8)); +} +static inline void *bam_swap_endian_2p(void *x) +{ + *(uint16_t*)x = bam_swap_endian_2(*(uint16_t*)x); + return x; +} +static inline uint32_t bam_swap_endian_4(uint32_t v) +{ + v = ((v & 0x0000FFFFU) << 16) | (v >> 16); + return ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8); +} +static inline void *bam_swap_endian_4p(void *x) +{ + *(uint32_t*)x = bam_swap_endian_4(*(uint32_t*)x); + return x; +} +static inline uint64_t bam_swap_endian_8(uint64_t v) +{ + v = ((v & 0x00000000FFFFFFFFLLU) << 32) | (v >> 32); + v = ((v & 0x0000FFFF0000FFFFLLU) << 16) | ((v & 0xFFFF0000FFFF0000LLU) >> 16); + return ((v & 0x00FF00FF00FF00FFLLU) << 8) | ((v & 0xFF00FF00FF00FF00LLU) >> 8); +} +static inline void *bam_swap_endian_8p(void *x) +{ + *(uint64_t*)x = bam_swap_endian_8(*(uint64_t*)x); + return x; +} + +#endif diff --git a/bgzf.c b/bgzf.c new file mode 100644 index 0000000..7a936a8 --- /dev/null +++ b/bgzf.c @@ -0,0 +1,676 @@ +/* The MIT License + + Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +/* + 2009-06-29 by lh3: cache recent uncompressed blocks. + 2009-06-25 by lh3: optionally use my knetfile library to access file on a FTP. + 2009-06-12 by lh3: support a mode string like "wu" where 'u' for uncompressed output */ + +#include +#include +#include +#include +#include +#include +#include +#include "bgzf.h" + +#include "khash.h" +typedef struct { + int size; + uint8_t *block; + int64_t end_offset; +} cache_t; +KHASH_MAP_INIT_INT64(cache, cache_t) + +#if defined(_WIN32) || defined(_MSC_VER) +#define ftello(fp) ftell(fp) +#define fseeko(fp, offset, whence) fseek(fp, offset, whence) +#else +extern off_t ftello(FILE *stream); +extern int fseeko(FILE *stream, off_t offset, int whence); +#endif + +typedef int8_t bgzf_byte_t; + +static const int DEFAULT_BLOCK_SIZE = 64 * 1024; +static const int MAX_BLOCK_SIZE = 64 * 1024; + +static const int BLOCK_HEADER_LENGTH = 18; +static const int BLOCK_FOOTER_LENGTH = 8; + +static const int GZIP_ID1 = 31; +static const int GZIP_ID2 = 139; +static const int CM_DEFLATE = 8; +static const int FLG_FEXTRA = 4; +static const int OS_UNKNOWN = 255; +static const int BGZF_ID1 = 66; // 'B' +static const int BGZF_ID2 = 67; // 'C' +static const int BGZF_LEN = 2; +static const int BGZF_XLEN = 6; // BGZF_LEN+4 + +static const int GZIP_WINDOW_BITS = -15; // no zlib header +static const int Z_DEFAULT_MEM_LEVEL = 8; + + +inline +void +packInt16(uint8_t* buffer, uint16_t value) +{ + buffer[0] = value; + buffer[1] = value >> 8; +} + +inline +int +unpackInt16(const uint8_t* buffer) +{ + return (buffer[0] | (buffer[1] << 8)); +} + +inline +void +packInt32(uint8_t* buffer, uint32_t value) +{ + buffer[0] = value; + buffer[1] = value >> 8; + buffer[2] = value >> 16; + buffer[3] = value >> 24; +} + +static inline +int +bgzf_min(int x, int y) +{ + return (x < y) ? x : y; +} + +static +void +report_error(BGZF* fp, const char* message) { + fp->error = message; +} + +static BGZF *bgzf_read_init() +{ + BGZF *fp; + fp = calloc(1, sizeof(BGZF)); + fp->uncompressed_block_size = MAX_BLOCK_SIZE; + fp->uncompressed_block = malloc(MAX_BLOCK_SIZE); + fp->compressed_block_size = MAX_BLOCK_SIZE; + fp->compressed_block = malloc(MAX_BLOCK_SIZE); + fp->cache_size = 0; + fp->cache = kh_init(cache); + return fp; +} + +static +BGZF* +open_read(int fd) +{ +#ifdef _USE_KNETFILE + knetFile *file = knet_dopen(fd, "r"); +#else + FILE* file = fdopen(fd, "r"); +#endif + BGZF* fp; + if (file == 0) return 0; + fp = bgzf_read_init(); + fp->file_descriptor = fd; + fp->open_mode = 'r'; +#ifdef _USE_KNETFILE + fp->x.fpr = file; +#else + fp->file = file; +#endif + return fp; +} + +static +BGZF* +open_write(int fd, bool is_uncompressed) +{ + FILE* file = fdopen(fd, "w"); + BGZF* fp; + if (file == 0) return 0; + fp = malloc(sizeof(BGZF)); + fp->file_descriptor = fd; + fp->open_mode = 'w'; + fp->owned_file = 0; fp->is_uncompressed = is_uncompressed; +#ifdef _USE_KNETFILE + fp->x.fpw = file; +#else + fp->file = file; +#endif + fp->uncompressed_block_size = DEFAULT_BLOCK_SIZE; + fp->uncompressed_block = NULL; + fp->compressed_block_size = MAX_BLOCK_SIZE; + fp->compressed_block = malloc(MAX_BLOCK_SIZE); + fp->block_address = 0; + fp->block_offset = 0; + fp->block_length = 0; + fp->error = NULL; + return fp; +} + +BGZF* +bgzf_open(const char* __restrict path, const char* __restrict mode) +{ + BGZF* fp = NULL; + if (mode[0] == 'r' || mode[0] == 'R') { /* The reading mode is preferred. */ +#ifdef _USE_KNETFILE + knetFile *file = knet_open(path, mode); + if (file == 0) return 0; + fp = bgzf_read_init(); + fp->file_descriptor = -1; + fp->open_mode = 'r'; + fp->x.fpr = file; +#else + int fd, oflag = O_RDONLY; +#ifdef _WIN32 + oflag |= O_BINARY; +#endif + fd = open(path, oflag); + if (fd == -1) return 0; + fp = open_read(fd); +#endif + } else if (mode[0] == 'w' || mode[0] == 'W') { + int fd, oflag = O_WRONLY | O_CREAT | O_TRUNC; +#ifdef _WIN32 + oflag |= O_BINARY; +#endif + fd = open(path, oflag, 0666); + if (fd == -1) return 0; + fp = open_write(fd, strstr(mode, "u")? 1 : 0); + } + if (fp != NULL) { + fp->owned_file = 1; + } + return fp; +} + +BGZF* +bgzf_fdopen(int fd, const char * __restrict mode) +{ + if (fd == -1) return 0; + if (mode[0] == 'r' || mode[0] == 'R') { + return open_read(fd); + } else if (mode[0] == 'w' || mode[0] == 'W') { + return open_write(fd, strstr(mode, "u")? 1 : 0); + } else { + return NULL; + } +} + +static +int +deflate_block(BGZF* fp, int block_length) +{ + // Deflate the block in fp->uncompressed_block into fp->compressed_block. + // Also adds an extra field that stores the compressed block length. + + bgzf_byte_t* buffer = fp->compressed_block; + int buffer_size = fp->compressed_block_size; + + // Init gzip header + buffer[0] = GZIP_ID1; + buffer[1] = GZIP_ID2; + buffer[2] = CM_DEFLATE; + buffer[3] = FLG_FEXTRA; + buffer[4] = 0; // mtime + buffer[5] = 0; + buffer[6] = 0; + buffer[7] = 0; + buffer[8] = 0; + buffer[9] = OS_UNKNOWN; + buffer[10] = BGZF_XLEN; + buffer[11] = 0; + buffer[12] = BGZF_ID1; + buffer[13] = BGZF_ID2; + buffer[14] = BGZF_LEN; + buffer[15] = 0; + buffer[16] = 0; // placeholder for block length + buffer[17] = 0; + + // loop to retry for blocks that do not compress enough + int input_length = block_length; + int compressed_length = 0; + while (1) { + int compress_level = fp->is_uncompressed? 0 : Z_DEFAULT_COMPRESSION; + z_stream zs; + zs.zalloc = NULL; + zs.zfree = NULL; + zs.next_in = fp->uncompressed_block; + zs.avail_in = input_length; + zs.next_out = (void*)&buffer[BLOCK_HEADER_LENGTH]; + zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH; + + int status = deflateInit2(&zs, compress_level, Z_DEFLATED, + GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY); + if (status != Z_OK) { + report_error(fp, "deflate init failed"); + return -1; + } + status = deflate(&zs, Z_FINISH); + if (status != Z_STREAM_END) { + deflateEnd(&zs); + if (status == Z_OK) { + // Not enough space in buffer. + // Can happen in the rare case the input doesn't compress enough. + // Reduce the amount of input until it fits. + input_length -= 1024; + if (input_length <= 0) { + // should never happen + report_error(fp, "input reduction failed"); + return -1; + } + continue; + } + report_error(fp, "deflate failed"); + return -1; + } + status = deflateEnd(&zs); + if (status != Z_OK) { + report_error(fp, "deflate end failed"); + return -1; + } + compressed_length = zs.total_out; + compressed_length += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH; + if (compressed_length > MAX_BLOCK_SIZE) { + // should never happen + report_error(fp, "deflate overflow"); + return -1; + } + break; + } + + packInt16((uint8_t*)&buffer[16], compressed_length-1); + uint32_t crc = crc32(0L, NULL, 0L); + crc = crc32(crc, fp->uncompressed_block, input_length); + packInt32((uint8_t*)&buffer[compressed_length-8], crc); + packInt32((uint8_t*)&buffer[compressed_length-4], input_length); + + int remaining = block_length - input_length; + if (remaining > 0) { + if (remaining > input_length) { + // should never happen (check so we can use memcpy) + report_error(fp, "remainder too large"); + return -1; + } + memcpy(fp->uncompressed_block, + fp->uncompressed_block + input_length, + remaining); + } + fp->block_offset = remaining; + return compressed_length; +} + +static +int +inflate_block(BGZF* fp, int block_length) +{ + // Inflate the block in fp->compressed_block into fp->uncompressed_block + + z_stream zs; + zs.zalloc = NULL; + zs.zfree = NULL; + zs.next_in = fp->compressed_block + 18; + zs.avail_in = block_length - 16; + zs.next_out = fp->uncompressed_block; + zs.avail_out = fp->uncompressed_block_size; + + int status = inflateInit2(&zs, GZIP_WINDOW_BITS); + if (status != Z_OK) { + report_error(fp, "inflate init failed"); + return -1; + } + status = inflate(&zs, Z_FINISH); + if (status != Z_STREAM_END) { + inflateEnd(&zs); + report_error(fp, "inflate failed"); + return -1; + } + status = inflateEnd(&zs); + if (status != Z_OK) { + report_error(fp, "inflate failed"); + return -1; + } + return zs.total_out; +} + +static +int +check_header(const bgzf_byte_t* header) +{ + return (header[0] == GZIP_ID1 && + header[1] == (bgzf_byte_t) GZIP_ID2 && + header[2] == Z_DEFLATED && + (header[3] & FLG_FEXTRA) != 0 && + unpackInt16((uint8_t*)&header[10]) == BGZF_XLEN && + header[12] == BGZF_ID1 && + header[13] == BGZF_ID2 && + unpackInt16((uint8_t*)&header[14]) == BGZF_LEN); +} + +static void free_cache(BGZF *fp) +{ + khint_t k; + khash_t(cache) *h = (khash_t(cache)*)fp->cache; + if (fp->open_mode != 'r') return; + for (k = kh_begin(h); k < kh_end(h); ++k) + if (kh_exist(h, k)) free(kh_val(h, k).block); + kh_destroy(cache, h); +} + +static int load_block_from_cache(BGZF *fp, int64_t block_address) +{ + khint_t k; + cache_t *p; + khash_t(cache) *h = (khash_t(cache)*)fp->cache; + k = kh_get(cache, h, block_address); + if (k == kh_end(h)) return 0; + p = &kh_val(h, k); + if (fp->block_length != 0) fp->block_offset = 0; + fp->block_address = block_address; + fp->block_length = p->size; + memcpy(fp->uncompressed_block, p->block, MAX_BLOCK_SIZE); +#ifdef _USE_KNETFILE + knet_seek(fp->x.fpr, p->end_offset, SEEK_SET); +#else + fseeko(fp->file, p->end_offset, SEEK_SET); +#endif + return p->size; +} + +static void cache_block(BGZF *fp, int size) +{ + int ret; + khint_t k; + cache_t *p; + khash_t(cache) *h = (khash_t(cache)*)fp->cache; + if (MAX_BLOCK_SIZE >= fp->cache_size) return; + if ((kh_size(h) + 1) * MAX_BLOCK_SIZE > fp->cache_size) { + /* A better way would be to remove the oldest block in the + * cache, but here we remove a random one for simplicity. This + * should not have a big impact on performance. */ + for (k = kh_begin(h); k < kh_end(h); ++k) + if (kh_exist(h, k)) break; + if (k < kh_end(h)) { + free(kh_val(h, k).block); + kh_del(cache, h, k); + } + } + k = kh_put(cache, h, fp->block_address, &ret); + if (ret == 0) return; // if this happens, a bug! + p = &kh_val(h, k); + p->size = fp->block_length; + p->end_offset = fp->block_address + size; + p->block = malloc(MAX_BLOCK_SIZE); + memcpy(kh_val(h, k).block, fp->uncompressed_block, MAX_BLOCK_SIZE); +} + +int +bgzf_read_block(BGZF* fp) +{ + bgzf_byte_t header[BLOCK_HEADER_LENGTH]; + int size = 0; +#ifdef _USE_KNETFILE + int64_t block_address = knet_tell(fp->x.fpr); + if (load_block_from_cache(fp, block_address)) return 0; + int count = knet_read(fp->x.fpr, header, sizeof(header)); +#else + int64_t block_address = ftello(fp->file); + if (load_block_from_cache(fp, block_address)) return 0; + int count = fread(header, 1, sizeof(header), fp->file); +#endif + if (count == 0) { + fp->block_length = 0; + return 0; + } + size = count; + if (count != sizeof(header)) { + report_error(fp, "read failed"); + return -1; + } + if (!check_header(header)) { + report_error(fp, "invalid block header"); + return -1; + } + int block_length = unpackInt16((uint8_t*)&header[16]) + 1; + bgzf_byte_t* compressed_block = (bgzf_byte_t*) fp->compressed_block; + memcpy(compressed_block, header, BLOCK_HEADER_LENGTH); + int remaining = block_length - BLOCK_HEADER_LENGTH; +#ifdef _USE_KNETFILE + count = knet_read(fp->x.fpr, &compressed_block[BLOCK_HEADER_LENGTH], remaining); +#else + count = fread(&compressed_block[BLOCK_HEADER_LENGTH], 1, remaining, fp->file); +#endif + if (count != remaining) { + report_error(fp, "read failed"); + return -1; + } + size += count; + count = inflate_block(fp, block_length); + if (count < 0) { + return -1; + } + if (fp->block_length != 0) { + // Do not reset offset if this read follows a seek. + fp->block_offset = 0; + } + fp->block_address = block_address; + fp->block_length = count; + cache_block(fp, size); + return 0; +} + +int +bgzf_read(BGZF* fp, void* data, int length) +{ + if (length <= 0) { + return 0; + } + if (fp->open_mode != 'r') { + report_error(fp, "file not open for reading"); + return -1; + } + + int bytes_read = 0; + bgzf_byte_t* output = data; + while (bytes_read < length) { + int available = fp->block_length - fp->block_offset; + if (available <= 0) { + if (bgzf_read_block(fp) != 0) { + return -1; + } + available = fp->block_length - fp->block_offset; + if (available <= 0) { + break; + } + } + int copy_length = bgzf_min(length-bytes_read, available); + bgzf_byte_t* buffer = fp->uncompressed_block; + memcpy(output, buffer + fp->block_offset, copy_length); + fp->block_offset += copy_length; + output += copy_length; + bytes_read += copy_length; + } + if (fp->block_offset == fp->block_length) { +#ifdef _USE_KNETFILE + fp->block_address = knet_tell(fp->x.fpr); +#else + fp->block_address = ftello(fp->file); +#endif + fp->block_offset = 0; + fp->block_length = 0; + } + return bytes_read; +} + +static +int +flush_block(BGZF* fp) +{ + while (fp->block_offset > 0) { + int block_length = deflate_block(fp, fp->block_offset); + if (block_length < 0) { + return -1; + } +#ifdef _USE_KNETFILE + int count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw); +#else + int count = fwrite(fp->compressed_block, 1, block_length, fp->file); +#endif + if (count != block_length) { + report_error(fp, "write failed"); + return -1; + } + fp->block_address += block_length; + } + return 0; +} + +int +bgzf_write(BGZF* fp, const void* data, int length) +{ + if (fp->open_mode != 'w') { + report_error(fp, "file not open for writing"); + return -1; + } + + if (fp->uncompressed_block == NULL) { + fp->uncompressed_block = malloc(fp->uncompressed_block_size); + } + + const bgzf_byte_t* input = data; + int block_length = fp->uncompressed_block_size; + int bytes_written = 0; + while (bytes_written < length) { + int copy_length = bgzf_min(block_length - fp->block_offset, length - bytes_written); + bgzf_byte_t* buffer = fp->uncompressed_block; + memcpy(buffer + fp->block_offset, input, copy_length); + fp->block_offset += copy_length; + input += copy_length; + bytes_written += copy_length; + if (fp->block_offset == block_length) { + if (flush_block(fp) != 0) { + break; + } + } + } + return bytes_written; +} + +int +bgzf_close(BGZF* fp) +{ + if (fp->open_mode == 'w') { + if (flush_block(fp) != 0) { + return -1; + } + { // add an empty block + int count, block_length = deflate_block(fp, 0); +#ifdef _USE_KNETFILE + count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw); +#else + count = fwrite(fp->compressed_block, 1, block_length, fp->file); +#endif + } +#ifdef _USE_KNETFILE + if (fflush(fp->x.fpw) != 0) { +#else + if (fflush(fp->file) != 0) { +#endif + report_error(fp, "flush failed"); + return -1; + } + } + if (fp->owned_file) { +#ifdef _USE_KNETFILE + int ret; + if (fp->open_mode == 'w') ret = fclose(fp->x.fpw); + else ret = knet_close(fp->x.fpr); + if (ret != 0) return -1; +#else + if (fclose(fp->file) != 0) { + return -1; + } +#endif + } + free(fp->uncompressed_block); + free(fp->compressed_block); + free_cache(fp); + free(fp); + return 0; +} + +void bgzf_set_cache_size(BGZF *fp, int cache_size) +{ + if (fp) fp->cache_size = cache_size; +} + +int bgzf_check_EOF(BGZF *fp) +{ + static uint8_t magic[28] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0"; + uint8_t buf[28]; + off_t offset; +#ifdef _USE_KNETFILE + offset = knet_tell(fp->x.fpr); + if (knet_seek(fp->x.fpr, -28, SEEK_END) != 0) return -1; + knet_read(fp->x.fpr, buf, 28); + knet_seek(fp->x.fpr, offset, SEEK_SET); +#else + offset = ftello(fp->file); + if (fseeko(fp->file, -28, SEEK_END) != 0) return -1; + fread(buf, 1, 28, fp->file); + fseeko(fp->file, offset, SEEK_SET); +#endif + return (memcmp(magic, buf, 28) == 0)? 1 : 0; +} + +int64_t +bgzf_seek(BGZF* fp, int64_t pos, int where) +{ + if (fp->open_mode != 'r') { + report_error(fp, "file not open for read"); + return -1; + } + if (where != SEEK_SET) { + report_error(fp, "unimplemented seek option"); + return -1; + } + int block_offset = pos & 0xFFFF; + int64_t block_address = (pos >> 16) & 0xFFFFFFFFFFFFLL; +#ifdef _USE_KNETFILE + if (knet_seek(fp->x.fpr, block_address, SEEK_SET) != 0) { +#else + if (fseeko(fp->file, block_address, SEEK_SET) != 0) { +#endif + report_error(fp, "seek failed"); + return -1; + } + fp->block_length = 0; // indicates current block is not loaded + fp->block_address = block_address; + fp->block_offset = block_offset; + return 0; +} diff --git a/bgzf.h b/bgzf.h new file mode 100644 index 0000000..f544a67 --- /dev/null +++ b/bgzf.h @@ -0,0 +1,156 @@ +/* The MIT License + + Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#ifndef __BGZF_H +#define __BGZF_H + +#include +#include +#include +#include +#ifdef _USE_KNETFILE +#include "knetfile.h" +#endif + +//typedef int8_t bool; + +typedef struct { + int file_descriptor; + char open_mode; // 'r' or 'w' + bool owned_file, is_uncompressed; +#ifdef _USE_KNETFILE + union { + knetFile *fpr; + FILE *fpw; + } x; +#else + FILE* file; +#endif + int uncompressed_block_size; + int compressed_block_size; + void* uncompressed_block; + void* compressed_block; + int64_t block_address; + int block_length; + int block_offset; + int cache_size; + const char* error; + void *cache; // a pointer to a hash table +} BGZF; + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Open an existing file descriptor for reading or writing. + * Mode must be either "r" or "w". + * A subsequent bgzf_close will not close the file descriptor. + * Returns null on error. + */ +BGZF* bgzf_fdopen(int fd, const char* __restrict mode); + +/* + * Open the specified file for reading or writing. + * Mode must be either "r" or "w". + * Returns null on error. + */ +BGZF* bgzf_open(const char* path, const char* __restrict mode); + +/* + * Close the BGZ file and free all associated resources. + * Does not close the underlying file descriptor if created with bgzf_fdopen. + * Returns zero on success, -1 on error. + */ +int bgzf_close(BGZF* fp); + +/* + * Read up to length bytes from the file storing into data. + * Returns the number of bytes actually read. + * Returns zero on end of file. + * Returns -1 on error. + */ +int bgzf_read(BGZF* fp, void* data, int length); + +/* + * Write length bytes from data to the file. + * Returns the number of bytes written. + * Returns -1 on error. + */ +int bgzf_write(BGZF* fp, const void* data, int length); + +/* + * Return a virtual file pointer to the current location in the file. + * No interpetation of the value should be made, other than a subsequent + * call to bgzf_seek can be used to position the file at the same point. + * Return value is non-negative on success. + * Returns -1 on error. + */ +#define bgzf_tell(fp) ((fp->block_address << 16) | (fp->block_offset & 0xFFFF)) + +/* + * Set the file to read from the location specified by pos, which must + * be a value previously returned by bgzf_tell for this file (but not + * necessarily one returned by this file handle). + * The where argument must be SEEK_SET. + * Seeking on a file opened for write is not supported. + * Returns zero on success, -1 on error. + */ +int64_t bgzf_seek(BGZF* fp, int64_t pos, int where); + +/* + * Set the cache size. Zero to disable. By default, caching is + * disabled. The recommended cache size for frequent random access is + * about 8M bytes. + */ +void bgzf_set_cache_size(BGZF *fp, int cache_size); + +int bgzf_check_EOF(BGZF *fp); + +int bgzf_read_block(BGZF* fp); + +#ifdef __cplusplus +} +#endif + +static inline int bgzf_getc(BGZF *fp) +{ + int c; + if (fp->block_offset >= fp->block_length) { + if (bgzf_read_block(fp) != 0) return -2; /* error */ + if (fp->block_length == 0) return -1; /* end-of-file */ + } + c = ((unsigned char*)fp->uncompressed_block)[fp->block_offset++]; + if (fp->block_offset == fp->block_length) { +#ifdef _USE_KNETFILE + fp->block_address = knet_tell(fp->x.fpr); +#else + fp->block_address = ftello(fp->file); +#endif + fp->block_offset = 0; + fp->block_length = 0; + } + return c; +} + +#endif diff --git a/bgzip.c b/bgzip.c new file mode 100644 index 0000000..381ebc5 --- /dev/null +++ b/bgzip.c @@ -0,0 +1,179 @@ +/* The MIT License + + Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#include +#include +#include +#include +#include +#include +#include +#include "bgzf.h" + +static const int WINDOW_SIZE = 64 * 1024; + +static int is_ready(int fd) +{ + fd_set fdset; + struct timeval timeout; + FD_ZERO(&fdset); + FD_SET(fd, &fdset); + timeout.tv_sec = 0; timeout.tv_usec = 100000; + return select(1, &fdset, NULL, NULL, &timeout) == 1 ? 1 : 0; +} + +static int bgzip_main_usage() +{ + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: bgzip [options] [file] ...\n\n"); + fprintf(stderr, "Options: -c write on standard output, keep original files unchanged\n"); + fprintf(stderr, " -d decompress\n"); + fprintf(stderr, " -b INT decompress at virtual file pointer INT\n"); + fprintf(stderr, " -s INT decompress INT bytes in the uncompressed file\n"); + fprintf(stderr, " -h give this help\n"); + fprintf(stderr, "\n"); + return 1; +} + +static int write_open(const char *fn, int is_forced) +{ + int fd = -1; + char c; + if (!is_forced) { + if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL, 0666)) < 0 && errno == EEXIST) { + fprintf(stderr, "[bgzip] %s already exists; do you wish to overwrite (y or n)? ", fn); + scanf("%c", &c); + if (c != 'Y' && c != 'y') { + fprintf(stderr, "[bgzip] not overwritten\n"); + exit(1); + } + } + } + if (fd < 0) { + if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC, 0666)) < 0) { + fprintf(stderr, "[bgzip] %s: Fail to write\n", fn); + exit(1); + } + } + return fd; +} + +static void fail(BGZF* fp) +{ + fprintf(stderr, "Error: %s\n", fp->error); + exit(1); +} + +int main(int argc, char **argv) +{ + int c, compress, pstdout, is_forced; + BGZF *fp; + void *buffer; + long start, end, size; + + compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0; + while((c = getopt(argc, argv, "cdhfb:s:")) >= 0){ + switch(c){ + case 'h': return bgzip_main_usage(); + case 'd': compress = 0; break; + case 'c': pstdout = 1; break; + case 'b': start = atol(optarg); break; + case 's': size = atol(optarg); break; + case 'f': is_forced = 1; break; + } + } + if (size >= 0) end = start + size; + if (end >= 0 && end < start) { + fprintf(stderr, "[bgzip] Illegal region: [%ld, %ld]\n", start, end); + return 1; + } + if (compress == 1) { + int f_src, f_dst = -1; + if (is_ready(fileno(stdin))) pstdout = 1; + if (argc > optind && !pstdout) { + if ((f_src = open(argv[optind], O_RDONLY)) < 0) { + fprintf(stderr, "[bgzip] Cannot open file: %s\n", argv[optind]); + return 1; + } + if (pstdout) { + f_dst = fileno(stdout); + } else { + char *name = malloc(sizeof(strlen(argv[optind]) + 5)); + strcpy(name, argv[optind]); + strcat(name, ".gz"); + f_dst = write_open(name, is_forced); + if (f_dst < 0) return 1; + free(name); + } + } else if (pstdout) { + f_src = fileno(stdin); + f_dst = fileno(stdout); + } else return bgzip_main_usage(); + fp = bgzf_fdopen(f_dst, "w"); + buffer = malloc(WINDOW_SIZE); + while ((c = read(f_src, buffer, WINDOW_SIZE)) > 0) + if (bgzf_write(fp, buffer, c) < 0) fail(fp); + // f_dst will be closed here + if (bgzf_close(fp) < 0) fail(fp); + if (argc > optind) unlink(argv[optind]); + free(buffer); + close(f_src); + return 0; + } else { + int f_dst, is_stdin = 0; + if (argc == optind) pstdout = 1; + if (is_ready(fileno(stdin))) is_stdin = 1; + if (argc <= optind && !is_stdin) return bgzip_main_usage(); + if (argc > optind && !pstdout) { + char *name; + if (strstr(argv[optind], ".gz") - argv[optind] != strlen(argv[optind]) - 3) { + fprintf(stderr, "[bgzip] %s: unknown suffix -- ignored\n", argv[optind]); + return 1; + } + name = strdup(argv[optind]); + name[strlen(name) - 3] = '\0'; + f_dst = write_open(name, is_forced); + free(name); + } else f_dst = fileno(stdout); + fp = (argc == optind)? bgzf_fdopen(fileno(stdin), "r") : bgzf_open(argv[optind], "r"); + if (fp == NULL) { + fprintf(stderr, "[bgzip] Could not open file: %s\n", argv[optind]); + return 1; + } + buffer = malloc(WINDOW_SIZE); + if (bgzf_seek(fp, start, SEEK_SET) < 0) fail(fp); + while (1) { + if (end < 0) c = bgzf_read(fp, buffer, WINDOW_SIZE); + else c = bgzf_read(fp, buffer, (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start)); + if (c == 0) break; + if (c < 0) fail(fp); + start += c; + write(f_dst, buffer, c); + if (end >= 0 && start >= end) break; + } + free(buffer); + if (bgzf_close(fp) < 0) fail(fp); + if (!pstdout) unlink(argv[optind]); + return 0; + } +} diff --git a/index.c b/index.c new file mode 100644 index 0000000..cc56498 --- /dev/null +++ b/index.c @@ -0,0 +1,789 @@ +#include +#include +#include "khash.h" +#include "ksort.h" +#include "kstring.h" +#include "bam_endian.h" +#ifdef _USE_KNETFILE +#include "knetfile.h" +#endif +#include "tabix.h" + +#define TAD_MIN_CHUNK_GAP 32768 +// 1<<14 is the size of minimum bin. +#define TAD_LIDX_SHIFT 14 + +typedef struct { + uint64_t u, v; +} pair64_t; + +#define pair64_lt(a,b) ((a).u < (b).u) +KSORT_INIT(off, pair64_t, pair64_lt) + +typedef struct { + uint32_t m, n; + pair64_t *list; +} ti_binlist_t; + +typedef struct { + int32_t n, m; + uint64_t *offset; +} ti_lidx_t; + +KHASH_MAP_INIT_INT(i, ti_binlist_t) +KHASH_MAP_INIT_STR(s, int) + +struct __ti_index_t { + ti_conf_t conf; + int32_t n, max; + khash_t(s) *tname; + khash_t(i) **index; + ti_lidx_t *index2; +}; + +typedef struct { + int tid, beg, end, bin; +} ti_intv_t; + +ti_conf_t ti_conf_gff = { 0, 1, 4, 5, '#', 0 }; +ti_conf_t ti_conf_bed = { TI_FLAG_UCSC, 1, 2, 3, '#', 0 }; +ti_conf_t ti_conf_psltbl = { TI_FLAG_UCSC, 15, 17, 18, '#', 0 }; +ti_conf_t ti_conf_sam = { TI_PRESET_SAM, 3, 4, 0, '@', 0 }; +ti_conf_t ti_conf_vcf = { TI_PRESET_VCF, 1, 2, 0, '#', 0 }; + +/*************** + * read a line * + ***************/ + +/* +int ti_readline(BGZF *fp, kstring_t *str) +{ + int c, l = 0; + str->l = 0; + while ((c = bgzf_getc(fp)) >= 0 && c != '\n') { + ++l; + if (c != '\r') kputc(c, str); + } + if (c < 0 && l == 0) return -1; // end of file + return str->l; +} +*/ + +/* Below is a faster implementation largely equivalent to the one + * commented out above. */ +int ti_readline(BGZF *fp, kstring_t *str) +{ + int l, state = 0; + unsigned char *buf = (unsigned char*)fp->uncompressed_block; + str->l = 0; + do { + if (fp->block_offset >= fp->block_length) { + if (bgzf_read_block(fp) != 0) { state = -2; break; } + if (fp->block_length == 0) { state = -1; break; } + } + for (l = fp->block_offset; l < fp->block_length && buf[l] != '\n'; ++l); + if (l < fp->block_length) state = 1; + l -= fp->block_offset; + if (str->l + l + 1 >= str->m) { + str->m = str->l + l + 2; + kroundup32(str->m); + str->s = (char*)realloc(str->s, str->m); + } + memcpy(str->s + str->l, buf + fp->block_offset, l); + str->l += l; + fp->block_offset += l + 1; + if (fp->block_offset >= fp->block_length) { +#ifdef _USE_KNETFILE + fp->block_address = knet_tell(fp->x.fpr); +#else + fp->block_address = ftello(fp->file); +#endif + fp->block_offset = 0; + fp->block_length = 0; + } + } while (state == 0); + if (str->l == 0 && state < 0) return state; + str->s[str->l] = 0; + return str->l; +} + +/************************************* + * get the interval from a data line * + *************************************/ + +static inline int ti_reg2bin(uint32_t beg, uint32_t end) +{ + --end; + if (beg>>14 == end>>14) return 4681 + (beg>>14); + if (beg>>17 == end>>17) return 585 + (beg>>17); + if (beg>>20 == end>>20) return 73 + (beg>>20); + if (beg>>23 == end>>23) return 9 + (beg>>23); + if (beg>>26 == end>>26) return 1 + (beg>>26); + return 0; +} + +static int get_tid(ti_index_t *idx, const char *ss) +{ + khint_t k; + int tid; + k = kh_get(s, idx->tname, ss); + if (k == kh_end(idx->tname)) { // a new target sequence + int ret, size; + // update idx->n, ->max, ->index and ->index2 + if (idx->n == idx->max) { + idx->max = idx->max? idx->max<<1 : 8; + idx->index = realloc(idx->index, idx->max * sizeof(void*)); + idx->index2 = realloc(idx->index2, idx->max * sizeof(ti_lidx_t)); + } + memset(&idx->index2[idx->n], 0, sizeof(ti_lidx_t)); + idx->index[idx->n++] = kh_init(i); + // update ->tname + tid = size = kh_size(idx->tname); + k = kh_put(s, idx->tname, strdup(ss), &ret); + kh_value(idx->tname, k) = size; + assert(idx->n == kh_size(idx->tname)); + } else tid = kh_value(idx->tname, k); + return tid; +} + +static int get_intv(ti_index_t *idx, kstring_t *str, ti_intv_t *intv) +{ + int i, b = 0, id = 1; + char *s; + intv->tid = intv->beg = intv->end = intv->bin = -1; + for (i = 0; i <= str->l; ++i) { + if (str->s[i] == '\t' || str->s[i] == 0) { + if (id == idx->conf.sc) { + str->s[i] = 0; + intv->tid = get_tid(idx, str->s + b); + if (i != str->l) str->s[i] = '\t'; + } else if (id == idx->conf.bc) { + // here ->beg is 1-based. it will be changed to 0-based at the end of this routine. + intv->beg = strtol(str->s + b, &s, 0); + if (idx->conf.preset&TI_FLAG_UCSC) ++intv->beg; + } else { + if ((idx->conf.preset&0xffff) == TI_PRESET_GENERIC) { + if (id == idx->conf.ec) intv->end = strtol(str->s + b, &s, 0); + } else if ((idx->conf.preset&0xffff) == TI_PRESET_SAM) { + if (id == 6) { // CIGAR + int l = 0, op; + char *t; + for (s = str->s + b; s < str->s + i;) { + long x = strtol(s, &t, 10); + op = toupper(*t); + if (op == 'M' || op == 'D' || op == 'N') l += x; + s = t + 1; + } + intv->end = intv->beg + l - 1; + } + } else if ((idx->conf.preset&0xffff) == TI_PRESET_VCF) { + // FIXME: the following is NOT tested and is likely to be buggy + if (id == 5) { // ALT + char *t; + int max = 1; + for (s = str->s + b; s < str->s + i;) { + if (s[i] == 'D') { + long x = strtol(s + 1, &t, 10); + if (x > max) max = x; + s = t + 1; + } else ++s; + } + intv->end = intv->beg + max - 1; + } + } + } + b = i + 1; + ++id; + } + } + if (intv->tid < 0 || intv->beg < 0 || intv->end < 0) return -1; + intv->bin = ti_reg2bin(intv->beg-1, intv->end); + return 0; +} + +/************ + * indexing * + ************/ + +// requirement: len <= LEN_MASK +static inline void insert_offset(khash_t(i) *h, int bin, uint64_t beg, uint64_t end) +{ + khint_t k; + ti_binlist_t *l; + int ret; + k = kh_put(i, h, bin, &ret); + l = &kh_value(h, k); + if (ret) { // not present + l->m = 1; l->n = 0; + l->list = (pair64_t*)calloc(l->m, 16); + } + if (l->n == l->m) { + l->m <<= 1; + l->list = (pair64_t*)realloc(l->list, l->m * 16); + } + l->list[l->n].u = beg; l->list[l->n++].v = end; +} + +static inline void insert_offset2(ti_lidx_t *index2, int _beg, int _end, uint64_t offset) +{ + int i, beg, end; + beg = _beg >> TAD_LIDX_SHIFT; + end = (_end - 1) >> TAD_LIDX_SHIFT; + if (index2->m < end + 1) { + int old_m = index2->m; + index2->m = end + 1; + kroundup32(index2->m); + index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8); + memset(index2->offset + old_m, 0, 8 * (index2->m - old_m)); + } + for (i = beg + 1; i <= end; ++i) + if (index2->offset[i] == 0) index2->offset[i] = offset; + index2->n = end + 1; +} + +static void merge_chunks(ti_index_t *idx) +{ + khash_t(i) *index; + int i, l, m; + khint_t k; + for (i = 0; i < idx->n; ++i) { + index = idx->index[i]; + for (k = kh_begin(index); k != kh_end(index); ++k) { + ti_binlist_t *p; + if (!kh_exist(index, k)) continue; + p = &kh_value(index, k); + m = 0; + for (l = 1; l < p->n; ++l) { + if (p->list[m].v>>16 == p->list[l].u>>16) p->list[m].v = p->list[l].v; + else p->list[++m] = p->list[l]; + } // ~for(l) + p->n = m + 1; + } // ~for(k) + } // ~for(i) +} + +ti_index_t *ti_index_core(BGZF *fp, const ti_conf_t *conf) +{ + int ret; + ti_index_t *idx; + uint32_t last_bin, save_bin; + int32_t last_coor, last_tid, save_tid; + uint64_t save_off, last_off, lineno = 0; + kstring_t *str; + + str = calloc(1, sizeof(kstring_t)); + + idx = (ti_index_t*)calloc(1, sizeof(ti_index_t)); + idx->conf = *conf; + idx->n = idx->max = 0; + idx->tname = kh_init(s); + idx->index = 0; + idx->index2 = 0; + + save_bin = save_tid = last_tid = last_bin = 0xffffffffu; + save_off = last_off = bgzf_tell(fp); last_coor = 0xffffffffu; + while ((ret = ti_readline(fp, str)) >= 0) { + ti_intv_t intv; + ++lineno; + if (lineno <= idx->conf.line_skip || str->s[0] == idx->conf.meta_char) { + last_off = bgzf_tell(fp); + continue; + } + get_intv(idx, str, &intv); + if (last_tid != intv.tid) { // change of chromosomes + last_tid = intv.tid; + last_bin = 0xffffffffu; + } else if (last_coor > intv.beg) { + fprintf(stderr, "[ti_index_core] the file is not sorted at line %llu\n", (unsigned long long)lineno); + exit(1); + } + if (intv.bin < 4681) insert_offset2(&idx->index2[intv.tid], intv.beg, intv.end, last_off); + if (intv.bin != last_bin) { // then possibly write the binning index + if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record + insert_offset(idx->index[save_tid], save_bin, save_off, last_off); + save_off = last_off; + save_bin = last_bin = intv.bin; + save_tid = intv.tid; + if (save_tid < 0) break; + } + if (bgzf_tell(fp) <= last_off) { + fprintf(stderr, "[ti_index_core] bug in BGZF: %llx < %llx\n", + (unsigned long long)bgzf_tell(fp), (unsigned long long)last_off); + exit(1); + } + last_off = bgzf_tell(fp); + last_coor = intv.beg; + } + if (save_tid >= 0) insert_offset(idx->index[save_tid], save_bin, save_off, bgzf_tell(fp)); + merge_chunks(idx); + + free(str->s); free(str); + return idx; +} + +void ti_index_destroy(ti_index_t *idx) +{ + khint_t k; + int i; + if (idx == 0) return; + // destroy the name hash table + for (k = kh_begin(idx->tname); k != kh_end(idx->tname); ++k) { + if (kh_exist(idx->tname, k)) + free((char*)kh_key(idx->tname, k)); + } + kh_destroy(s, idx->tname); + // destroy the binning index + for (i = 0; i < idx->n; ++i) { + khash_t(i) *index = idx->index[i]; + ti_lidx_t *index2 = idx->index2 + i; + for (k = kh_begin(index); k != kh_end(index); ++k) { + if (kh_exist(index, k)) + free(kh_value(index, k).list); + } + kh_destroy(i, index); + free(index2->offset); + } + free(idx->index); + // destroy the linear index + free(idx->index2); + free(idx); +} + +/****************** + * index file I/O * + ******************/ + +void ti_index_save(const ti_index_t *idx, BGZF *fp) +{ + int32_t i, size, ti_is_be; + khint_t k; + ti_is_be = bam_is_big_endian(); + bgzf_write(fp, "TBI\1", 4); + if (ti_is_be) { + uint32_t x = idx->n; + bgzf_write(fp, bam_swap_endian_4p(&x), 4); + } else bgzf_write(fp, &idx->n, 4); + assert(sizeof(ti_conf_t) == 24); + if (ti_is_be) { // write ti_conf_t; + uint32_t x[6]; + memcpy(x, &idx->conf, 24); + for (i = 0; i < 6; ++i) bgzf_write(fp, bam_swap_endian_4p(&x[i]), 4); + } else bgzf_write(fp, &idx->conf, sizeof(ti_conf_t)); + { // write target names + char **name; + int32_t l = 0; + name = calloc(kh_size(idx->tname), sizeof(void*)); + for (k = kh_begin(idx->tname); k != kh_end(idx->tname); ++k) + if (kh_exist(idx->tname, k)) + name[kh_value(idx->tname, k)] = (char*)kh_key(idx->tname, k); + for (i = 0; i < kh_size(idx->tname); ++i) + l += strlen(name[i]) + 1; + if (ti_is_be) bgzf_write(fp, bam_swap_endian_4p(&l), 4); + else bgzf_write(fp, &l, 4); + for (i = 0; i < kh_size(idx->tname); ++i) + bgzf_write(fp, name[i], strlen(name[i]) + 1); + free(name); + } + for (i = 0; i < idx->n; ++i) { + khash_t(i) *index = idx->index[i]; + ti_lidx_t *index2 = idx->index2 + i; + // write binning index + size = kh_size(index); + if (ti_is_be) { // big endian + uint32_t x = size; + bgzf_write(fp, bam_swap_endian_4p(&x), 4); + } else bgzf_write(fp, &size, 4); + for (k = kh_begin(index); k != kh_end(index); ++k) { + if (kh_exist(index, k)) { + ti_binlist_t *p = &kh_value(index, k); + if (ti_is_be) { // big endian + uint32_t x; + x = kh_key(index, k); bgzf_write(fp, bam_swap_endian_4p(&x), 4); + x = p->n; bgzf_write(fp, bam_swap_endian_4p(&x), 4); + for (x = 0; (int)x < p->n; ++x) { + bam_swap_endian_8p(&p->list[x].u); + bam_swap_endian_8p(&p->list[x].v); + } + bgzf_write(fp, p->list, 16 * p->n); + for (x = 0; (int)x < p->n; ++x) { + bam_swap_endian_8p(&p->list[x].u); + bam_swap_endian_8p(&p->list[x].v); + } + } else { + bgzf_write(fp, &kh_key(index, k), 4); + bgzf_write(fp, &p->n, 4); + bgzf_write(fp, p->list, 16 * p->n); + } + } + } + // write linear index (index2) + if (ti_is_be) { + int x = index2->n; + bgzf_write(fp, bam_swap_endian_4p(&x), 4); + } else bgzf_write(fp, &index2->n, 4); + if (ti_is_be) { // big endian + int x; + for (x = 0; (int)x < index2->n; ++x) + bam_swap_endian_8p(&index2->offset[x]); + bgzf_write(fp, index2->offset, 8 * index2->n); + for (x = 0; (int)x < index2->n; ++x) + bam_swap_endian_8p(&index2->offset[x]); + } else bgzf_write(fp, index2->offset, 8 * index2->n); + } +} + +static ti_index_t *ti_index_load_core(BGZF *fp) +{ + int i, ti_is_be; + char magic[4]; + ti_index_t *idx; + ti_is_be = bam_is_big_endian(); + if (fp == 0) { + fprintf(stderr, "[ti_index_load_core] fail to load index.\n"); + return 0; + } + bgzf_read(fp, magic, 4); + if (strncmp(magic, "TBI\1", 4)) { + fprintf(stderr, "[ti_index_load] wrong magic number.\n"); + return 0; + } + idx = (ti_index_t*)calloc(1, sizeof(ti_index_t)); + bgzf_read(fp, &idx->n, 4); + if (ti_is_be) bam_swap_endian_4p(&idx->n); + idx->tname = kh_init(s); + idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*)); + idx->index2 = (ti_lidx_t*)calloc(idx->n, sizeof(ti_lidx_t)); + // read idx->conf + bgzf_read(fp, &idx->conf, sizeof(ti_conf_t)); + if (ti_is_be) { + bam_swap_endian_4p(&idx->conf.preset); + bam_swap_endian_4p(&idx->conf.sc); + bam_swap_endian_4p(&idx->conf.bc); + bam_swap_endian_4p(&idx->conf.ec); + bam_swap_endian_4p(&idx->conf.meta_char); + bam_swap_endian_4p(&idx->conf.line_skip); + } + { // read target names + int j, ret; + kstring_t *str; + int32_t l; + uint8_t *buf; + bgzf_read(fp, &l, 4); + if (ti_is_be) bam_swap_endian_4p(&l); + buf = calloc(l, 1); + bgzf_read(fp, buf, l); + str = calloc(1, sizeof(kstring_t)); + for (i = j = 0; i < l; ++i) { + if (buf[i] == 0) { + khint_t k = kh_put(s, idx->tname, strdup(str->s), &ret); + kh_value(idx->tname, k) = j++; + str->l = 0; + } else kputc(buf[i], str); + } + free(str->s); free(str); free(buf); + } + for (i = 0; i < idx->n; ++i) { + khash_t(i) *index; + ti_lidx_t *index2 = idx->index2 + i; + uint32_t key, size; + khint_t k; + int j, ret; + ti_binlist_t *p; + index = idx->index[i] = kh_init(i); + // load binning index + bgzf_read(fp, &size, 4); + if (ti_is_be) bam_swap_endian_4p(&size); + for (j = 0; j < (int)size; ++j) { + bgzf_read(fp, &key, 4); + if (ti_is_be) bam_swap_endian_4p(&key); + k = kh_put(i, index, key, &ret); + p = &kh_value(index, k); + bgzf_read(fp, &p->n, 4); + if (ti_is_be) bam_swap_endian_4p(&p->n); + p->m = p->n; + p->list = (pair64_t*)malloc(p->m * 16); + bgzf_read(fp, p->list, 16 * p->n); + if (ti_is_be) { + int x; + for (x = 0; x < p->n; ++x) { + bam_swap_endian_8p(&p->list[x].u); + bam_swap_endian_8p(&p->list[x].v); + } + } + } + // load linear index + bgzf_read(fp, &index2->n, 4); + if (ti_is_be) bam_swap_endian_4p(&index2->n); + index2->m = index2->n; + index2->offset = (uint64_t*)calloc(index2->m, 8); + bgzf_read(fp, index2->offset, index2->n * 8); + if (ti_is_be) + for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]); + } + return idx; +} + +ti_index_t *ti_index_load_local(const char *_fn) +{ + BGZF *fp; + char *fnidx, *fn; + + if (strstr(_fn, "ftp://") == _fn || strstr(_fn, "http://") == _fn) { + const char *p; + int l = strlen(_fn); + for (p = _fn + l - 1; p >= _fn; --p) + if (*p == '/') break; + fn = strdup(p + 1); + } else fn = strdup(_fn); + fnidx = (char*)calloc(strlen(fn) + 5, 1); + strcpy(fnidx, fn); strcat(fnidx, ".tbi"); + fp = bgzf_open(fnidx, "r"); + free(fnidx); free(fn); + if (fp) { + ti_index_t *idx = ti_index_load_core(fp); + bgzf_close(fp); + return idx; + } else return 0; +} + +#ifdef _USE_KNETFILE +static void download_from_remote(const char *url) +{ + const int buf_size = 1 * 1024 * 1024; + char *fn; + FILE *fp; + uint8_t *buf; + knetFile *fp_remote; + int l; + if (strstr(url, "ftp://") != url && strstr(url, "http://") != url) return; + l = strlen(url); + for (fn = (char*)url + l - 1; fn >= url; --fn) + if (*fn == '/') break; + ++fn; // fn now points to the file name + fp_remote = knet_open(url, "r"); + if (fp_remote == 0) { + fprintf(stderr, "[download_from_remote] fail to open remote file.\n"); + return; + } + if ((fp = fopen(fn, "w")) == 0) { + fprintf(stderr, "[download_from_remote] fail to create file in the working directory.\n"); + knet_close(fp_remote); + return; + } + buf = (uint8_t*)calloc(buf_size, 1); + while ((l = knet_read(fp_remote, buf, buf_size)) != 0) + fwrite(buf, 1, l, fp); + free(buf); + fclose(fp); + knet_close(fp_remote); +} +#else +static void download_from_remote(const char *url) +{ + return; +} +#endif + +ti_index_t *ti_index_load(const char *fn) +{ + ti_index_t *idx; + idx = ti_index_load_local(fn); + if (idx == 0 && (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn)) { + char *fnidx = calloc(strlen(fn) + 5, 1); + strcat(strcpy(fnidx, fn), ".tbi"); + fprintf(stderr, "[ti_index_load] attempting to download the remote index file.\n"); + download_from_remote(fnidx); + idx = ti_index_load_local(fn); + } + if (idx == 0) fprintf(stderr, "[ti_index_load] fail to load BAM index.\n"); + return idx; +} + +int ti_index_build2(const char *fn, const ti_conf_t *conf, const char *_fnidx) +{ + char *fnidx; + BGZF *fp, *fpidx; + ti_index_t *idx; + if ((fp = bgzf_open(fn, "r")) == 0) { + fprintf(stderr, "[ti_index_build2] fail to open the BAM file.\n"); + return -1; + } + idx = ti_index_core(fp, conf); + bgzf_close(fp); + if (_fnidx == 0) { + fnidx = (char*)calloc(strlen(fn) + 5, 1); + strcpy(fnidx, fn); strcat(fnidx, ".tbi"); + } else fnidx = strdup(_fnidx); + fpidx = bgzf_open(fnidx, "w"); + if (fpidx == 0) { + fprintf(stderr, "[ti_index_build2] fail to create the index file.\n"); + free(fnidx); + return -1; + } + ti_index_save(idx, fpidx); + ti_index_destroy(idx); + bgzf_close(fpidx); + free(fnidx); + return 0; +} + +int ti_index_build(const char *fn, const ti_conf_t *conf) +{ + return ti_index_build2(fn, conf, 0); +} + +/******************************************** + * parse a region in the format chr:beg-end * + ********************************************/ + +int ti_parse_region(ti_index_t *idx, const char *str, int *tid, int *begin, int *end) +{ + char *s, *p; + int i, l, k; + khiter_t iter; + khash_t(s) *h = idx->tname; + l = strlen(str); + p = s = (char*)malloc(l+1); + /* squeeze out "," */ + for (i = k = 0; i != l; ++i) + if (str[i] != ',' && !isspace(str[i])) s[k++] = str[i]; + s[k] = 0; + for (i = 0; i != k; ++i) if (s[i] == ':') break; + s[i] = 0; + iter = kh_get(s, h, s); /* get the tid */ + if (iter == kh_end(h)) { // name not found + *tid = -1; free(s); + return -1; + } + *tid = kh_value(h, iter); + if (i == k) { /* dump the whole sequence */ + *begin = 0; *end = 1<<29; free(s); + return 0; + } + for (p = s + i + 1; i != k; ++i) if (s[i] == '-') break; + *begin = atoi(p); + if (i < k) { + p = s + i + 1; + *end = atoi(p); + } else *end = 1<<29; + if (*begin > 0) --*begin; + free(s); + if (*begin > *end) return -1; + return 0; +} + +/******************************* + * retrieve a specified region * + *******************************/ + +#define MAX_BIN 37450 // =(8^6-1)/7+1 + +static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[MAX_BIN]) +{ + int i = 0, k; + --end; + list[i++] = 0; + for (k = 1 + (beg>>26); k <= 1 + (end>>26); ++k) list[i++] = k; + for (k = 9 + (beg>>23); k <= 9 + (end>>23); ++k) list[i++] = k; + for (k = 73 + (beg>>20); k <= 73 + (end>>20); ++k) list[i++] = k; + for (k = 585 + (beg>>17); k <= 585 + (end>>17); ++k) list[i++] = k; + for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k; + return i; +} + +static inline int is_overlap(int32_t beg, int32_t end, int rbeg, int rend) +{ + return (rend > beg && rbeg < end); +} + +// ti_fetch helper function retrieves +pair64_t *get_chunk_coordinates(const ti_index_t *idx, int tid, int beg, int end, int* cnt_off) +{ + uint16_t *bins; + int i, n_bins, n_off; + pair64_t *off; + khint_t k; + khash_t(i) *index; + uint64_t min_off; + + bins = (uint16_t*)calloc(MAX_BIN, 2); + n_bins = reg2bins(beg, end, bins); + index = idx->index[tid]; + min_off = (beg>>TAD_LIDX_SHIFT >= idx->index2[tid].n)? 0 : idx->index2[tid].offset[beg>>TAD_LIDX_SHIFT]; + for (i = n_off = 0; i < n_bins; ++i) { + if ((k = kh_get(i, index, bins[i])) != kh_end(index)) + n_off += kh_value(index, k).n; + } + if (n_off == 0) { + free(bins); return 0; + } + off = (pair64_t*)calloc(n_off, 16); + for (i = n_off = 0; i < n_bins; ++i) { + if ((k = kh_get(i, index, bins[i])) != kh_end(index)) { + int j; + ti_binlist_t *p = &kh_value(index, k); + for (j = 0; j < p->n; ++j) + if (p->list[j].v > min_off) off[n_off++] = p->list[j]; + } + } + free(bins); + { + int l; + ks_introsort(off, n_off, off); + // resolve completely contained adjacent blocks + for (i = 1, l = 0; i < n_off; ++i) + if (off[l].v < off[i].v) + off[++l] = off[i]; + n_off = l + 1; + // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing + for (i = 1; i < n_off; ++i) + if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u; + { // merge adjacent blocks + for (i = 1, l = 0; i < n_off; ++i) { + if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v; + else off[++l] = off[i]; + } + n_off = l + 1; + } + } + *cnt_off = n_off; + return off; +} + +int ti_fetch(BGZF *fp, const ti_index_t *idx, int tid, int beg, int end, void *data, ti_fetch_f func) +{ + int n_off; + pair64_t *off = get_chunk_coordinates(idx, tid, beg, end, &n_off); + if (off == 0) return 0; + { + // retrive alignments + uint64_t curr_off; + int i, ret, n_seeks; + kstring_t *str = calloc(1, sizeof(kstring_t)); + n_seeks = 0; i = -1; curr_off = 0; + for (;;) { + if (curr_off == 0 || curr_off >= off[i].v) { // then jump to the next chunk + if (i == n_off - 1) break; // no more chunks + if (i >= 0) assert(curr_off == off[i].v); // otherwise bug + if (i < 0 || off[i].v != off[i+1].u) { // not adjacent chunks; then seek + bgzf_seek(fp, off[i+1].u, SEEK_SET); + curr_off = bgzf_tell(fp); + ++n_seeks; + } + ++i; + } + if ((ret = ti_readline(fp, str)) >= 0) { + ti_intv_t intv; + curr_off = bgzf_tell(fp); + if (str->s[0] == idx->conf.meta_char) continue; + get_intv((ti_index_t*)idx, str, &intv); + if (intv.tid != tid || intv.beg >= end) break; // no need to proceed + else if (is_overlap(beg, end, intv.beg, intv.end)) func(str->l, str->s, data); + } else break; // end of file + } +// fprintf(stderr, "[ti_fetch] # seek calls: %d\n", n_seeks); + free(str->s); free(str); + } + free(off); + return 0; +} diff --git a/khash.h b/khash.h new file mode 100644 index 0000000..1d583ef --- /dev/null +++ b/khash.h @@ -0,0 +1,486 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li */ + +/* + An example: + +#include "khash.h" +KHASH_MAP_INIT_INT(32, char) +int main() { + int ret, is_missing; + khiter_t k; + khash_t(32) *h = kh_init(32); + k = kh_put(32, h, 5, &ret); + if (!ret) kh_del(32, h, k); + kh_value(h, k) = 10; + k = kh_get(32, h, 10); + is_missing = (k == kh_end(h)); + k = kh_get(32, h, 5); + kh_del(32, h, k); + for (k = kh_begin(h); k != kh_end(h); ++k) + if (kh_exist(h, k)) kh_value(h, k) = 1; + kh_destroy(32, h); + return 0; +} +*/ + +/* + 2008-09-19 (0.2.3): + + * Corrected the example + * Improved interfaces + + 2008-09-11 (0.2.2): + + * Improved speed a little in kh_put() + + 2008-09-10 (0.2.1): + + * Added kh_clear() + * Fixed a compiling error + + 2008-09-02 (0.2.0): + + * Changed to token concatenation which increases flexibility. + + 2008-08-31 (0.1.2): + + * Fixed a bug in kh_get(), which has not been tested previously. + + 2008-08-31 (0.1.1): + + * Added destructor +*/ + + +#ifndef __AC_KHASH_H +#define __AC_KHASH_H + +/*! + @header + + Generic hash table library. + + @copyright Heng Li + */ + +#define AC_VERSION_KHASH_H "0.2.2" + +#include +#include +#include + +typedef uint32_t khint_t; +typedef khint_t khiter_t; + +#define __ac_HASH_PRIME_SIZE 32 +static const uint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] = +{ + 0ul, 3ul, 11ul, 23ul, 53ul, + 97ul, 193ul, 389ul, 769ul, 1543ul, + 3079ul, 6151ul, 12289ul, 24593ul, 49157ul, + 98317ul, 196613ul, 393241ul, 786433ul, 1572869ul, + 3145739ul, 6291469ul, 12582917ul, 25165843ul, 50331653ul, + 100663319ul, 201326611ul, 402653189ul, 805306457ul, 1610612741ul, + 3221225473ul, 4294967291ul +}; + +#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2) +#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1) +#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3) +#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1))) +#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1))) +#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1))) +#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1)) + +static const double __ac_HASH_UPPER = 0.77; + +#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + typedef struct { \ + khint_t n_buckets, size, n_occupied, upper_bound; \ + uint32_t *flags; \ + khkey_t *keys; \ + khval_t *vals; \ + } kh_##name##_t; \ + static inline kh_##name##_t *kh_init_##name() { \ + return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t)); \ + } \ + static inline void kh_destroy_##name(kh_##name##_t *h) \ + { \ + if (h) { \ + free(h->keys); free(h->flags); \ + free(h->vals); \ + free(h); \ + } \ + } \ + static inline void kh_clear_##name(kh_##name##_t *h) \ + { \ + if (h && h->flags) { \ + memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(uint32_t)); \ + h->size = h->n_occupied = 0; \ + } \ + } \ + static inline khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ + { \ + if (h->n_buckets) { \ + khint_t inc, k, i, last; \ + k = __hash_func(key); i = k % h->n_buckets; \ + inc = 1 + k % (h->n_buckets - 1); last = i; \ + while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ + if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \ + else i += inc; \ + if (i == last) return h->n_buckets; \ + } \ + return __ac_iseither(h->flags, i)? h->n_buckets : i; \ + } else return 0; \ + } \ + static inline void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ + { \ + uint32_t *new_flags = 0; \ + khint_t j = 1; \ + { \ + khint_t t = __ac_HASH_PRIME_SIZE - 1; \ + while (__ac_prime_list[t] > new_n_buckets) --t; \ + new_n_buckets = __ac_prime_list[t+1]; \ + if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; \ + else { \ + new_flags = (uint32_t*)malloc(((new_n_buckets>>4) + 1) * sizeof(uint32_t)); \ + memset(new_flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(uint32_t)); \ + if (h->n_buckets < new_n_buckets) { \ + h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (kh_is_map) \ + h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ + } \ + } \ + } \ + if (j) { \ + for (j = 0; j != h->n_buckets; ++j) { \ + if (__ac_iseither(h->flags, j) == 0) { \ + khkey_t key = h->keys[j]; \ + khval_t val; \ + if (kh_is_map) val = h->vals[j]; \ + __ac_set_isdel_true(h->flags, j); \ + while (1) { \ + khint_t inc, k, i; \ + k = __hash_func(key); \ + i = k % new_n_buckets; \ + inc = 1 + k % (new_n_buckets - 1); \ + while (!__ac_isempty(new_flags, i)) { \ + if (i + inc >= new_n_buckets) i = i + inc - new_n_buckets; \ + else i += inc; \ + } \ + __ac_set_isempty_false(new_flags, i); \ + if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { \ + { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ + if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \ + __ac_set_isdel_true(h->flags, i); \ + } else { \ + h->keys[i] = key; \ + if (kh_is_map) h->vals[i] = val; \ + break; \ + } \ + } \ + } \ + } \ + if (h->n_buckets > new_n_buckets) { \ + h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (kh_is_map) \ + h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ + } \ + free(h->flags); \ + h->flags = new_flags; \ + h->n_buckets = new_n_buckets; \ + h->n_occupied = h->size; \ + h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ + } \ + } \ + static inline khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ + { \ + khint_t x; \ + if (h->n_occupied >= h->upper_bound) { \ + if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); \ + else kh_resize_##name(h, h->n_buckets + 1); \ + } \ + { \ + khint_t inc, k, i, site, last; \ + x = site = h->n_buckets; k = __hash_func(key); i = k % h->n_buckets; \ + if (__ac_isempty(h->flags, i)) x = i; \ + else { \ + inc = 1 + k % (h->n_buckets - 1); last = i; \ + while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ + if (__ac_isdel(h->flags, i)) site = i; \ + if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \ + else i += inc; \ + if (i == last) { x = site; break; } \ + } \ + if (x == h->n_buckets) { \ + if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \ + else x = i; \ + } \ + } \ + } \ + if (__ac_isempty(h->flags, x)) { \ + h->keys[x] = key; \ + __ac_set_isboth_false(h->flags, x); \ + ++h->size; ++h->n_occupied; \ + *ret = 1; \ + } else if (__ac_isdel(h->flags, x)) { \ + h->keys[x] = key; \ + __ac_set_isboth_false(h->flags, x); \ + ++h->size; \ + *ret = 2; \ + } else *ret = 0; \ + return x; \ + } \ + static inline void kh_del_##name(kh_##name##_t *h, khint_t x) \ + { \ + if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ + __ac_set_isdel_true(h->flags, x); \ + --h->size; \ + } \ + } + +/* --- BEGIN OF HASH FUNCTIONS --- */ + +/*! @function + @abstract Integer hash function + @param key The integer [uint32_t] + @return The hash value [khint_t] + */ +#define kh_int_hash_func(key) (uint32_t)(key) +/*! @function + @abstract Integer comparison function + */ +#define kh_int_hash_equal(a, b) ((a) == (b)) +/*! @function + @abstract 64-bit integer hash function + @param key The integer [uint64_t] + @return The hash value [khint_t] + */ +#define kh_int64_hash_func(key) (uint32_t)((key)>>33^(key)^(key)<<11) +/*! @function + @abstract 64-bit integer comparison function + */ +#define kh_int64_hash_equal(a, b) ((a) == (b)) +/*! @function + @abstract const char* hash function + @param s Pointer to a null terminated string + @return The hash value + */ +static inline khint_t __ac_X31_hash_string(const char *s) +{ + khint_t h = *s; + if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s; + return h; +} +/*! @function + @abstract Another interface to const char* hash function + @param key Pointer to a null terminated string [const char*] + @return The hash value [khint_t] + */ +#define kh_str_hash_func(key) __ac_X31_hash_string(key) +/*! @function + @abstract Const char* comparison function + */ +#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0) + +/* --- END OF HASH FUNCTIONS --- */ + +/* Other necessary macros... */ + +/*! + @abstract Type of the hash table. + @param name Name of the hash table [symbol] + */ +#define khash_t(name) kh_##name##_t + +/*! @function + @abstract Initiate a hash table. + @param name Name of the hash table [symbol] + @return Pointer to the hash table [khash_t(name)*] + */ +#define kh_init(name) kh_init_##name() + +/*! @function + @abstract Destroy a hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + */ +#define kh_destroy(name, h) kh_destroy_##name(h) + +/*! @function + @abstract Reset a hash table without deallocating memory. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + */ +#define kh_clear(name, h) kh_clear_##name(h) + +/*! @function + @abstract Resize a hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param s New size [khint_t] + */ +#define kh_resize(name, h, s) kh_resize_##name(h, s) + +/*! @function + @abstract Insert a key to the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Key [type of keys] + @param r Extra return code: 0 if the key is present in the hash table; + 1 if the bucket is empty (never used); 2 if the element in + the bucket has been deleted [int*] + @return Iterator to the inserted element [khint_t] + */ +#define kh_put(name, h, k, r) kh_put_##name(h, k, r) + +/*! @function + @abstract Retrieve a key from the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Key [type of keys] + @return Iterator to the found element, or kh_end(h) is the element is absent [khint_t] + */ +#define kh_get(name, h, k) kh_get_##name(h, k) + +/*! @function + @abstract Remove a key from the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Iterator to the element to be deleted [khint_t] + */ +#define kh_del(name, h, k) kh_del_##name(h, k) + + +/*! @function + @abstract Test whether a bucket contains data. + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return 1 if containing data; 0 otherwise [int] + */ +#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x))) + +/*! @function + @abstract Get key given an iterator + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return Key [type of keys] + */ +#define kh_key(h, x) ((h)->keys[x]) + +/*! @function + @abstract Get value given an iterator + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return Value [type of values] + @discussion For hash sets, calling this results in segfault. + */ +#define kh_val(h, x) ((h)->vals[x]) + +/*! @function + @abstract Alias of kh_val() + */ +#define kh_value(h, x) ((h)->vals[x]) + +/*! @function + @abstract Get the start iterator + @param h Pointer to the hash table [khash_t(name)*] + @return The start iterator [khint_t] + */ +#define kh_begin(h) (khint_t)(0) + +/*! @function + @abstract Get the end iterator + @param h Pointer to the hash table [khash_t(name)*] + @return The end iterator [khint_t] + */ +#define kh_end(h) ((h)->n_buckets) + +/*! @function + @abstract Get the number of elements in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @return Number of elements in the hash table [khint_t] + */ +#define kh_size(h) ((h)->size) + +/*! @function + @abstract Get the number of buckets in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @return Number of buckets in the hash table [khint_t] + */ +#define kh_n_buckets(h) ((h)->n_buckets) + +/* More conenient interfaces */ + +/*! @function + @abstract Instantiate a hash set containing integer keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_INT(name) \ + KHASH_INIT(name, uint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_INT(name, khval_t) \ + KHASH_INIT(name, uint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing 64-bit integer keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_INT64(name) \ + KHASH_INIT(name, uint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing 64-bit integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_INT64(name, khval_t) \ + KHASH_INIT(name, uint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) + +typedef const char *kh_cstr_t; +/*! @function + @abstract Instantiate a hash map containing const char* keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_STR(name) \ + KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing const char* keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_STR(name, khval_t) \ + KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal) + +#endif /* __AC_KHASH_H */ diff --git a/knetfile.c b/knetfile.c new file mode 100644 index 0000000..994babb --- /dev/null +++ b/knetfile.c @@ -0,0 +1,632 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li */ + +/* Probably I will not do socket programming in the next few years and + therefore I decide to heavily annotate this file, for Linux and + Windows as well. -lh3 */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _WIN32 +#include +#else +#include +#include +#include +#endif + +#include "knetfile.h" + +/* In winsock.h, the type of a socket is SOCKET, which is: "typedef + * u_int SOCKET". An invalid SOCKET is: "(SOCKET)(~0)", or signed + * integer -1. In knetfile.c, I use "int" for socket type + * throughout. This should be improved to avoid confusion. + * + * In Linux/Mac, recv() and read() do almost the same thing. You can see + * in the header file that netread() is simply an alias of read(). In + * Windows, however, they are different and using recv() is mandatory. + */ + +/* This function tests if the file handler is ready for reading (or + * writing if is_read==0). */ +static int socket_wait(int fd, int is_read) +{ + fd_set fds, *fdr = 0, *fdw = 0; + struct timeval tv; + int ret; + tv.tv_sec = 5; tv.tv_usec = 0; // 5 seconds time out + FD_ZERO(&fds); + FD_SET(fd, &fds); + if (is_read) fdr = &fds; + else fdw = &fds; + ret = select(fd+1, fdr, fdw, 0, &tv); +#ifndef _WIN32 + if (ret == -1) perror("select"); +#else + if (ret == 0) + fprintf(stderr, "select time-out\n"); + else if (ret == SOCKET_ERROR) + fprintf(stderr, "select: %d\n", WSAGetLastError()); +#endif + return ret; +} + +#ifndef _WIN32 +/* This function does not work with Windows due to the lack of + * getaddrinfo() in winsock. It is addapted from an example in "Beej's + * Guide to Network Programming" (http://beej.us/guide/bgnet/). */ +static int socket_connect(const char *host, const char *port) +{ +#define __err_connect(func) do { perror(func); freeaddrinfo(res); return -1; } while (0) + + int on = 1, fd; + struct linger lng = { 0, 0 }; + struct addrinfo hints, *res; + memset(&hints, 0, sizeof(struct addrinfo)); + hints.ai_family = AF_UNSPEC; + hints.ai_socktype = SOCK_STREAM; + /* In Unix/Mac, getaddrinfo() is the most convenient way to get + * server information. */ + if (getaddrinfo(host, port, &hints, &res) != 0) __err_connect("getaddrinfo"); + if ((fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) __err_connect("socket"); + /* The following two setsockopt() are used by ftplib + * (http://nbpfaus.net/~pfau/ftplib/). I am not sure if they + * necessary. */ + if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) __err_connect("setsockopt"); + if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) __err_connect("setsockopt"); + if (connect(fd, res->ai_addr, res->ai_addrlen) != 0) __err_connect("connect"); + freeaddrinfo(res); + return fd; +} +#else +/* MinGW's printf has problem with "%lld" */ +char *int64tostr(char *buf, int64_t x) +{ + int cnt; + int i = 0; + do { + buf[i++] = '0' + x % 10; + x /= 10; + } while (x); + buf[i] = 0; + for (cnt = i, i = 0; i < cnt/2; ++i) { + int c = buf[i]; buf[i] = buf[cnt-i-1]; buf[cnt-i-1] = c; + } + return buf; +} + +int64_t strtoint64(const char *buf) +{ + int64_t x; + for (x = 0; *buf != '\0'; ++buf) + x = x * 10 + ((int64_t) *buf - 48); + return x; +} +/* In windows, the first thing is to establish the TCP connection. */ +int knet_win32_init() +{ + WSADATA wsaData; + return WSAStartup(MAKEWORD(2, 2), &wsaData); +} +void knet_win32_destroy() +{ + WSACleanup(); +} +/* A slightly modfied version of the following function also works on + * Mac (and presummably Linux). However, this function is not stable on + * my Mac. It sometimes works fine but sometimes does not. Therefore for + * non-Windows OS, I do not use this one. */ +static SOCKET socket_connect(const char *host, const char *port) +{ +#define __err_connect(func) \ + do { \ + fprintf(stderr, "%s: %d\n", func, WSAGetLastError()); \ + return -1; \ + } while (0) + + int on = 1; + SOCKET fd; + struct linger lng = { 0, 0 }; + struct sockaddr_in server; + struct hostent *hp = 0; + // open socket + if ((fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) == INVALID_SOCKET) __err_connect("socket"); + if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (char*)&on, sizeof(on)) == -1) __err_connect("setsockopt"); + if (setsockopt(fd, SOL_SOCKET, SO_LINGER, (char*)&lng, sizeof(lng)) == -1) __err_connect("setsockopt"); + // get host info + if (isalpha(host[0])) hp = gethostbyname(host); + else { + struct in_addr addr; + addr.s_addr = inet_addr(host); + hp = gethostbyaddr((char*)&addr, 4, AF_INET); + } + if (hp == 0) __err_connect("gethost"); + // connect + server.sin_addr.s_addr = *((unsigned long*)hp->h_addr); + server.sin_family= AF_INET; + server.sin_port = htons(atoi(port)); + if (connect(fd, (struct sockaddr*)&server, sizeof(server)) != 0) __err_connect("connect"); + // freehostent(hp); // strangely in MSDN, hp is NOT freed (memory leak?!) + return fd; +} +#endif + +static off_t my_netread(int fd, void *buf, off_t len) +{ + off_t rest = len, curr, l = 0; + /* recv() and read() may not read the required length of data with + * one call. They have to be called repeatedly. */ + while (rest) { + if (socket_wait(fd, 1) <= 0) break; // socket is not ready for reading + curr = netread(fd, buf + l, rest); + /* According to the glibc manual, section 13.2, a zero returned + * value indicates end-of-file (EOF), which should mean that + * read() will not return zero if EOF has not been met but data + * are not immediately available. */ + if (curr == 0) break; + l += curr; rest -= curr; + } + return l; +} + +/************************* + * FTP specific routines * + *************************/ + +static int kftp_get_response(knetFile *ftp) +{ +#ifndef _WIN32 + unsigned char c; +#else + char c; +#endif + int n = 0; + char *p; + if (socket_wait(ftp->ctrl_fd, 1) <= 0) return 0; + while (netread(ftp->ctrl_fd, &c, 1)) { // FIXME: this is *VERY BAD* for unbuffered I/O + //fputc(c, stderr); + if (n >= ftp->max_response) { + ftp->max_response = ftp->max_response? ftp->max_response<<1 : 256; + ftp->response = realloc(ftp->response, ftp->max_response); + } + ftp->response[n++] = c; + if (c == '\n') { + if (n >= 4 && isdigit(ftp->response[0]) && isdigit(ftp->response[1]) && isdigit(ftp->response[2]) + && ftp->response[3] != '-') break; + n = 0; + continue; + } + } + if (n < 2) return -1; + ftp->response[n-2] = 0; + return strtol(ftp->response, &p, 0); +} + +static int kftp_send_cmd(knetFile *ftp, const char *cmd, int is_get) +{ + if (socket_wait(ftp->ctrl_fd, 0) <= 0) return -1; // socket is not ready for writing + netwrite(ftp->ctrl_fd, cmd, strlen(cmd)); + return is_get? kftp_get_response(ftp) : 0; +} + +static int kftp_pasv_prep(knetFile *ftp) +{ + char *p; + int v[6]; + kftp_send_cmd(ftp, "PASV\r\n", 1); + for (p = ftp->response; *p && *p != '('; ++p); + if (*p != '(') return -1; + ++p; + sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]); + memcpy(ftp->pasv_ip, v, 4 * sizeof(int)); + ftp->pasv_port = (v[4]<<8&0xff00) + v[5]; + return 0; +} + + +static int kftp_pasv_connect(knetFile *ftp) +{ + char host[80], port[10]; + if (ftp->pasv_port == 0) { + fprintf(stderr, "[kftp_pasv_connect] kftp_pasv_prep() is not called before hand.\n"); + return -1; + } + sprintf(host, "%d.%d.%d.%d", ftp->pasv_ip[0], ftp->pasv_ip[1], ftp->pasv_ip[2], ftp->pasv_ip[3]); + sprintf(port, "%d", ftp->pasv_port); + ftp->fd = socket_connect(host, port); + if (ftp->fd == -1) return -1; + return 0; +} + +int kftp_connect(knetFile *ftp) +{ + ftp->ctrl_fd = socket_connect(ftp->host, ftp->port); + if (ftp->ctrl_fd == -1) return -1; + kftp_get_response(ftp); + kftp_send_cmd(ftp, "USER anonymous\r\n", 1); + kftp_send_cmd(ftp, "PASS kftp@\r\n", 1); + kftp_send_cmd(ftp, "TYPE I\r\n", 1); + return 0; +} + +int kftp_reconnect(knetFile *ftp) +{ + if (ftp->ctrl_fd != -1) { + netclose(ftp->ctrl_fd); + ftp->ctrl_fd = -1; + } + netclose(ftp->fd); + ftp->fd = -1; + return kftp_connect(ftp); +} + +// initialize ->type, ->host, ->retr and ->size +knetFile *kftp_parse_url(const char *fn, const char *mode) +{ + knetFile *fp; + char *p; + int l; + if (strstr(fn, "ftp://") != fn) return 0; + for (p = (char*)fn + 6; *p && *p != '/'; ++p); + if (*p != '/') return 0; + l = p - fn - 6; + fp = calloc(1, sizeof(knetFile)); + fp->type = KNF_TYPE_FTP; + fp->fd = -1; + /* the Linux/Mac version of socket_connect() also recognizes a port + * like "ftp", but the Windows version does not. */ + fp->port = strdup("21"); + fp->host = calloc(l + 1, 1); + if (strchr(mode, 'c')) fp->no_reconnect = 1; + strncpy(fp->host, fn + 6, l); + fp->retr = calloc(strlen(p) + 8, 1); + sprintf(fp->retr, "RETR %s\r\n", p); + fp->size_cmd = calloc(strlen(p) + 8, 1); + sprintf(fp->size_cmd, "SIZE %s\r\n", p); + fp->seek_offset = 0; + return fp; +} +// place ->fd at offset off +int kftp_connect_file(knetFile *fp) +{ + int ret; + long long file_size; + if (fp->fd != -1) { + netclose(fp->fd); + if (fp->no_reconnect) kftp_get_response(fp); + } + kftp_pasv_prep(fp); + kftp_send_cmd(fp, fp->size_cmd, 1); +#ifndef _WIN32 + if ( sscanf(fp->response,"%*d %lld", &file_size) != 1 ) + { + fprintf(stderr,"[kftp_connect_file] %s\n", fp->response); + return -1; + } +#else + const char *p = fp->response; + while (*p != ' ') ++p; + while (*p < '0' || *p > '9') ++p; + file_size = strtoint64(p); +#endif + fp->file_size = file_size; + if (fp->offset>=0) { + char tmp[32]; +#ifndef _WIN32 + sprintf(tmp, "REST %lld\r\n", (long long)fp->offset); +#else + strcpy(tmp, "REST "); + int64tostr(tmp + 5, fp->offset); + strcat(tmp, "\r\n"); +#endif + kftp_send_cmd(fp, tmp, 1); + } + kftp_send_cmd(fp, fp->retr, 0); + kftp_pasv_connect(fp); + ret = kftp_get_response(fp); + if (ret != 150) { + fprintf(stderr, "[kftp_connect_file] %s\n", fp->response); + netclose(fp->fd); + fp->fd = -1; + return -1; + } + fp->is_ready = 1; + return 0; +} + + +/************************** + * HTTP specific routines * + **************************/ + +knetFile *khttp_parse_url(const char *fn, const char *mode) +{ + knetFile *fp; + char *p, *proxy, *q; + int l; + if (strstr(fn, "http://") != fn) return 0; + // set ->http_host + for (p = (char*)fn + 7; *p && *p != '/'; ++p); + l = p - fn - 7; + fp = calloc(1, sizeof(knetFile)); + fp->http_host = calloc(l + 1, 1); + strncpy(fp->http_host, fn + 7, l); + fp->http_host[l] = 0; + for (q = fp->http_host; *q && *q != ':'; ++q); + if (*q == ':') *q++ = 0; + // get http_proxy + proxy = getenv("http_proxy"); + // set ->host, ->port and ->path + if (proxy == 0) { + fp->host = strdup(fp->http_host); // when there is no proxy, server name is identical to http_host name. + fp->port = strdup(*q? q : "80"); + fp->path = strdup(*p? p : "/"); + } else { + fp->host = (strstr(proxy, "http://") == proxy)? strdup(proxy + 7) : strdup(proxy); + for (q = fp->host; *q && *q != ':'; ++q); + if (*q == ':') *q++ = 0; + fp->port = strdup(*q? q : "80"); + fp->path = strdup(fn); + } + fp->type = KNF_TYPE_HTTP; + fp->ctrl_fd = fp->fd = -1; + fp->seek_offset = 0; + return fp; +} + +int khttp_connect_file(knetFile *fp) +{ + int ret, l = 0; + char *buf, *p; + if (fp->fd != -1) netclose(fp->fd); + fp->fd = socket_connect(fp->host, fp->port); + buf = calloc(0x10000, 1); // FIXME: I am lazy... But in principle, 64KB should be large enough. + l += sprintf(buf + l, "GET %s HTTP/1.0\r\nHost: %s\r\n", fp->path, fp->http_host); + l += sprintf(buf + l, "Range: bytes=%lld-\r\n", (long long)fp->offset); + l += sprintf(buf + l, "\r\n"); + netwrite(fp->fd, buf, l); + l = 0; + while (netread(fp->fd, buf + l, 1)) { // read HTTP header; FIXME: bad efficiency + if (buf[l] == '\n' && l >= 3) + if (strncmp(buf + l - 3, "\r\n\r\n", 4) == 0) break; + ++l; + } + buf[l] = 0; + if (l < 14) { // prematured header + netclose(fp->fd); + fp->fd = -1; + return -1; + } + ret = strtol(buf + 8, &p, 0); // HTTP return code + if (ret == 200 && fp->offset>0) { // 200 (complete result); then skip beginning of the file + off_t rest = fp->offset; + while (rest) { + off_t l = rest < 0x10000? rest : 0x10000; + rest -= my_netread(fp->fd, buf, l); + } + } else if (ret != 206 && ret != 200) { + free(buf); + fprintf(stderr, "[khttp_connect_file] fail to open file (HTTP code: %d).\n", ret); + netclose(fp->fd); + fp->fd = -1; + return -1; + } + free(buf); + fp->is_ready = 1; + return 0; +} + +/******************** + * Generic routines * + ********************/ + +knetFile *knet_open(const char *fn, const char *mode) +{ + knetFile *fp = 0; + if (mode[0] != 'r') { + fprintf(stderr, "[kftp_open] only mode \"r\" is supported.\n"); + return 0; + } + if (strstr(fn, "ftp://") == fn) { + fp = kftp_parse_url(fn, mode); + if (fp == 0) return 0; + if (kftp_connect(fp) == -1) { + knet_close(fp); + return 0; + } + kftp_connect_file(fp); + } else if (strstr(fn, "http://") == fn) { + fp = khttp_parse_url(fn, mode); + if (fp == 0) return 0; + khttp_connect_file(fp); + } else { // local file +#ifdef _WIN32 + /* In windows, O_BINARY is necessary. In Linux/Mac, O_BINARY may + * be undefined on some systems, although it is defined on my + * Mac and the Linux I have tested on. */ + int fd = open(fn, O_RDONLY | O_BINARY); +#else + int fd = open(fn, O_RDONLY); +#endif + if (fd == -1) { + perror("open"); + return 0; + } + fp = (knetFile*)calloc(1, sizeof(knetFile)); + fp->type = KNF_TYPE_LOCAL; + fp->fd = fd; + fp->ctrl_fd = -1; + } + if (fp && fp->fd == -1) { + knet_close(fp); + return 0; + } + return fp; +} + +knetFile *knet_dopen(int fd, const char *mode) +{ + knetFile *fp = (knetFile*)calloc(1, sizeof(knetFile)); + fp->type = KNF_TYPE_LOCAL; + fp->fd = fd; + return fp; +} + +off_t knet_read(knetFile *fp, void *buf, off_t len) +{ + off_t l = 0; + if (fp->fd == -1) return 0; + if (fp->type == KNF_TYPE_FTP) { + if (fp->is_ready == 0) { + if (!fp->no_reconnect) kftp_reconnect(fp); + kftp_connect_file(fp); + } + } else if (fp->type == KNF_TYPE_HTTP) { + if (fp->is_ready == 0) + khttp_connect_file(fp); + } + if (fp->type == KNF_TYPE_LOCAL) { // on Windows, the following block is necessary; not on UNIX + off_t rest = len, curr; + while (rest) { + curr = read(fp->fd, buf + l, rest); + if (curr == 0) break; + l += curr; rest -= curr; + } + } else l = my_netread(fp->fd, buf, len); + fp->offset += l; + return l; +} + +off_t knet_seek(knetFile *fp, int64_t off, int whence) +{ + if (whence == SEEK_SET && off == fp->offset) return 0; + if (fp->type == KNF_TYPE_LOCAL) { + /* Be aware that lseek() returns the offset after seeking, + * while fseek() returns zero on success. */ + off_t offset = lseek(fp->fd, off, whence); + if (offset == -1) { + // Be silent, it is OK for knet_seek to fail when the file is streamed + // fprintf(stderr,"[knet_seek] %s\n", strerror(errno)); + return -1; + } + fp->offset = offset; + return 0; + } + else if (fp->type == KNF_TYPE_FTP) + { + if (whence==SEEK_CUR) + fp->offset += off; + else if (whence==SEEK_SET) + fp->offset = off; + else if ( whence==SEEK_END) + fp->offset = fp->file_size+off; + fp->is_ready = 0; + return 0; + } + else if (fp->type == KNF_TYPE_HTTP) + { + if (whence == SEEK_END) { // FIXME: can we allow SEEK_END in future? + fprintf(stderr, "[knet_seek] SEEK_END is not supported for HTTP. Offset is unchanged.\n"); + errno = ESPIPE; + return -1; + } + if (whence==SEEK_CUR) + fp->offset += off; + else if (whence==SEEK_SET) + fp->offset = off; + fp->is_ready = 0; + return fp->offset; + } + errno = EINVAL; + fprintf(stderr,"[knet_seek] %s\n", strerror(errno)); + return -1; +} + +int knet_close(knetFile *fp) +{ + if (fp == 0) return 0; + if (fp->ctrl_fd != -1) netclose(fp->ctrl_fd); // FTP specific + if (fp->fd != -1) { + /* On Linux/Mac, netclose() is an alias of close(), but on + * Windows, it is an alias of closesocket(). */ + if (fp->type == KNF_TYPE_LOCAL) close(fp->fd); + else netclose(fp->fd); + } + free(fp->host); free(fp->port); + free(fp->response); free(fp->retr); // FTP specific + free(fp->path); free(fp->http_host); // HTTP specific + free(fp); + return 0; +} + +#ifdef KNETFILE_MAIN +int main(void) +{ + char *buf; + knetFile *fp; + int type = 4, l; +#ifdef _WIN32 + knet_win32_init(); +#endif + buf = calloc(0x100000, 1); + if (type == 0) { + fp = knet_open("knetfile.c", "r"); + knet_seek(fp, 1000, SEEK_SET); + } else if (type == 1) { // NCBI FTP, large file + fp = knet_open("ftp://ftp.ncbi.nih.gov/1000genomes/ftp/data/NA12878/alignment/NA12878.chrom6.SLX.SRP000032.2009_06.bam", "r"); + knet_seek(fp, 2500000000ll, SEEK_SET); + l = knet_read(fp, buf, 255); + } else if (type == 2) { + fp = knet_open("ftp://ftp.sanger.ac.uk/pub4/treefam/tmp/index.shtml", "r"); + knet_seek(fp, 1000, SEEK_SET); + } else if (type == 3) { + fp = knet_open("http://www.sanger.ac.uk/Users/lh3/index.shtml", "r"); + knet_seek(fp, 1000, SEEK_SET); + } else if (type == 4) { + fp = knet_open("http://www.sanger.ac.uk/Users/lh3/ex1.bam", "r"); + knet_read(fp, buf, 10000); + knet_seek(fp, 20000, SEEK_SET); + knet_seek(fp, 10000, SEEK_SET); + l = knet_read(fp, buf+10000, 10000000) + 10000; + } + if (type != 4 && type != 1) { + knet_read(fp, buf, 255); + buf[255] = 0; + printf("%s\n", buf); + } else write(fileno(stdout), buf, l); + knet_close(fp); + free(buf); + return 0; +} +#endif diff --git a/knetfile.h b/knetfile.h new file mode 100644 index 0000000..0a0e66f --- /dev/null +++ b/knetfile.h @@ -0,0 +1,75 @@ +#ifndef KNETFILE_H +#define KNETFILE_H + +#include +#include + +#ifndef _WIN32 +#define netread(fd, ptr, len) read(fd, ptr, len) +#define netwrite(fd, ptr, len) write(fd, ptr, len) +#define netclose(fd) close(fd) +#else +#include +#define netread(fd, ptr, len) recv(fd, ptr, len, 0) +#define netwrite(fd, ptr, len) send(fd, ptr, len, 0) +#define netclose(fd) closesocket(fd) +#endif + +// FIXME: currently I/O is unbuffered + +#define KNF_TYPE_LOCAL 1 +#define KNF_TYPE_FTP 2 +#define KNF_TYPE_HTTP 3 + +typedef struct knetFile_s { + int type, fd; + int64_t offset; + char *host, *port; + + // the following are for FTP only + int ctrl_fd, pasv_ip[4], pasv_port, max_response, no_reconnect, is_ready; + char *response, *retr, *size_cmd; + int64_t seek_offset; // for lazy seek + int64_t file_size; + + // the following are for HTTP only + char *path, *http_host; +} knetFile; + +#define knet_tell(fp) ((fp)->offset) +#define knet_fileno(fp) ((fp)->fd) + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _WIN32 + int knet_win32_init(); + void knet_win32_destroy(); +#endif + + knetFile *knet_open(const char *fn, const char *mode); + + /* + This only works with local files. + */ + knetFile *knet_dopen(int fd, const char *mode); + + /* + If ->is_ready==0, this routine updates ->fd; otherwise, it simply + reads from ->fd. + */ + off_t knet_read(knetFile *fp, void *buf, off_t len); + + /* + This routine only sets ->offset and ->is_ready=0. It does not + communicate with the FTP server. + */ + off_t knet_seek(knetFile *fp, int64_t off, int whence); + int knet_close(knetFile *fp); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/ksort.h b/ksort.h new file mode 100644 index 0000000..16a03fd --- /dev/null +++ b/ksort.h @@ -0,0 +1,271 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li */ + +/* + 2008-11-16 (0.1.4): + + * Fixed a bug in introsort() that happens in rare cases. + + 2008-11-05 (0.1.3): + + * Fixed a bug in introsort() for complex comparisons. + + * Fixed a bug in mergesort(). The previous version is not stable. + + 2008-09-15 (0.1.2): + + * Accelerated introsort. On my Mac (not on another Linux machine), + my implementation is as fast as std::sort on random input. + + * Added combsort and in introsort, switch to combsort if the + recursion is too deep. + + 2008-09-13 (0.1.1): + + * Added k-small algorithm + + 2008-09-05 (0.1.0): + + * Initial version + +*/ + +#ifndef AC_KSORT_H +#define AC_KSORT_H + +#include +#include + +typedef struct { + void *left, *right; + int depth; +} ks_isort_stack_t; + +#define KSORT_SWAP(type_t, a, b) { register type_t t=(a); (a)=(b); (b)=t; } + +#define KSORT_INIT(name, type_t, __sort_lt) \ + void ks_mergesort_##name(size_t n, type_t array[], type_t temp[]) \ + { \ + type_t *a2[2], *a, *b; \ + int curr, shift; \ + \ + a2[0] = array; \ + a2[1] = temp? temp : (type_t*)malloc(sizeof(type_t) * n); \ + for (curr = 0, shift = 0; (1ul<> 1) - 1; i != (size_t)(-1); --i) \ + ks_heapadjust_##name(i, lsize, l); \ + } \ + void ks_heapsort_##name(size_t lsize, type_t l[]) \ + { \ + size_t i; \ + for (i = lsize - 1; i > 0; --i) { \ + type_t tmp; \ + tmp = *l; *l = l[i]; l[i] = tmp; ks_heapadjust_##name(0, i, l); \ + } \ + } \ + inline void __ks_insertsort_##name(type_t *s, type_t *t) \ + { \ + type_t *i, *j, swap_tmp; \ + for (i = s + 1; i < t; ++i) \ + for (j = i; j > s && __sort_lt(*j, *(j-1)); --j) { \ + swap_tmp = *j; *j = *(j-1); *(j-1) = swap_tmp; \ + } \ + } \ + void ks_combsort_##name(size_t n, type_t a[]) \ + { \ + const double shrink_factor = 1.2473309501039786540366528676643; \ + int do_swap; \ + size_t gap = n; \ + type_t tmp, *i, *j; \ + do { \ + if (gap > 2) { \ + gap = (size_t)(gap / shrink_factor); \ + if (gap == 9 || gap == 10) gap = 11; \ + } \ + do_swap = 0; \ + for (i = a; i < a + n - gap; ++i) { \ + j = i + gap; \ + if (__sort_lt(*j, *i)) { \ + tmp = *i; *i = *j; *j = tmp; \ + do_swap = 1; \ + } \ + } \ + } while (do_swap || gap > 2); \ + if (gap != 1) __ks_insertsort_##name(a, a + n); \ + } \ + void ks_introsort_##name(size_t n, type_t a[]) \ + { \ + int d; \ + ks_isort_stack_t *top, *stack; \ + type_t rp, swap_tmp; \ + type_t *s, *t, *i, *j, *k; \ + \ + if (n < 1) return; \ + else if (n == 2) { \ + if (__sort_lt(a[1], a[0])) { swap_tmp = a[0]; a[0] = a[1]; a[1] = swap_tmp; } \ + return; \ + } \ + for (d = 2; 1ul<>1) + 1; \ + if (__sort_lt(*k, *i)) { \ + if (__sort_lt(*k, *j)) k = j; \ + } else k = __sort_lt(*j, *i)? i : j; \ + rp = *k; \ + if (k != t) { swap_tmp = *k; *k = *t; *t = swap_tmp; } \ + for (;;) { \ + do ++i; while (__sort_lt(*i, rp)); \ + do --j; while (i <= j && __sort_lt(rp, *j)); \ + if (j <= i) break; \ + swap_tmp = *i; *i = *j; *j = swap_tmp; \ + } \ + swap_tmp = *i; *i = *t; *t = swap_tmp; \ + if (i-s > t-i) { \ + if (i-s > 16) { top->left = s; top->right = i-1; top->depth = d; ++top; } \ + s = t-i > 16? i+1 : t; \ + } else { \ + if (t-i > 16) { top->left = i+1; top->right = t; top->depth = d; ++top; } \ + t = i-s > 16? i-1 : s; \ + } \ + } else { \ + if (top == stack) { \ + free(stack); \ + __ks_insertsort_##name(a, a+n); \ + return; \ + } else { --top; s = (type_t*)top->left; t = (type_t*)top->right; d = top->depth; } \ + } \ + } \ + } \ + /* This function is adapted from: http://ndevilla.free.fr/median/ */ \ + /* 0 <= kk < n */ \ + type_t ks_ksmall_##name(size_t n, type_t arr[], size_t kk) \ + { \ + type_t *low, *high, *k, *ll, *hh, *mid; \ + low = arr; high = arr + n - 1; k = arr + kk; \ + for (;;) { \ + if (high <= low) return *k; \ + if (high == low + 1) { \ + if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \ + return *k; \ + } \ + mid = low + (high - low) / 2; \ + if (__sort_lt(*high, *mid)) KSORT_SWAP(type_t, *mid, *high); \ + if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \ + if (__sort_lt(*low, *mid)) KSORT_SWAP(type_t, *mid, *low); \ + KSORT_SWAP(type_t, *mid, *(low+1)); \ + ll = low + 1; hh = high; \ + for (;;) { \ + do ++ll; while (__sort_lt(*ll, *low)); \ + do --hh; while (__sort_lt(*low, *hh)); \ + if (hh < ll) break; \ + KSORT_SWAP(type_t, *ll, *hh); \ + } \ + KSORT_SWAP(type_t, *low, *hh); \ + if (hh <= k) low = ll; \ + if (hh >= k) high = hh - 1; \ + } \ + } + +#define ks_mergesort(name, n, a, t) ks_mergesort_##name(n, a, t) +#define ks_introsort(name, n, a) ks_introsort_##name(n, a) +#define ks_combsort(name, n, a) ks_combsort_##name(n, a) +#define ks_heapsort(name, n, a) ks_heapsort_##name(n, a) +#define ks_heapmake(name, n, a) ks_heapmake_##name(n, a) +#define ks_heapadjust(name, i, n, a) ks_heapadjust_##name(i, n, a) +#define ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k) + +#define ks_lt_generic(a, b) ((a) < (b)) +#define ks_lt_str(a, b) (strcmp((a), (b)) < 0) + +typedef const char *ksstr_t; + +#define KSORT_INIT_GENERIC(type_t) KSORT_INIT(type_t, type_t, ks_lt_generic) +#define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str) + +#endif diff --git a/kstring.c b/kstring.c new file mode 100644 index 0000000..e0203fa --- /dev/null +++ b/kstring.c @@ -0,0 +1,165 @@ +#include +#include +#include +#include +#include +#include "kstring.h" + +int ksprintf(kstring_t *s, const char *fmt, ...) +{ + va_list ap; + int l; + va_start(ap, fmt); + l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); // This line does not work with glibc 2.0. See `man snprintf'. + va_end(ap); + if (l + 1 > s->m - s->l) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + va_start(ap, fmt); + l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); + } + va_end(ap); + s->l += l; + return l; +} + +// s MUST BE a null terminated string; l = strlen(s) +int ksplit_core(char *s, int delimiter, int *_max, int **_offsets) +{ + int i, n, max, last_char, last_start, *offsets, l; + n = 0; max = *_max; offsets = *_offsets; + l = strlen(s); + +#define __ksplit_aux do { \ + if (_offsets) { \ + s[i] = 0; \ + if (n == max) { \ + max = max? max<<1 : 2; \ + offsets = (int*)realloc(offsets, sizeof(int) * max); \ + } \ + offsets[n++] = last_start; \ + } else ++n; \ + } while (0) + + for (i = 0, last_char = last_start = 0; i <= l; ++i) { + if (delimiter == 0) { + if (isspace(s[i]) || s[i] == 0) { + if (isgraph(last_char)) __ksplit_aux; // the end of a field + } else { + if (isspace(last_char) || last_char == 0) last_start = i; + } + } else { + if (s[i] == delimiter || s[i] == 0) { + if (last_char != 0 && last_char != delimiter) __ksplit_aux; // the end of a field + } else { + if (last_char == delimiter || last_char == 0) last_start = i; + } + } + last_char = s[i]; + } + *_max = max; *_offsets = offsets; + return n; +} + +/********************** + * Boyer-Moore search * + **********************/ + +// reference: http://www-igm.univ-mlv.fr/~lecroq/string/node14.html +int *ksBM_prep(const uint8_t *pat, int m) +{ + int i, *suff, *prep, *bmGs, *bmBc; + prep = calloc(m + 256, 1); + bmGs = prep; bmBc = prep + m; + { // preBmBc() + for (i = 0; i < 256; ++i) bmBc[i] = m; + for (i = 0; i < m - 1; ++i) bmBc[pat[i]] = m - i - 1; + } + suff = calloc(m, sizeof(int)); + { // suffixes() + int f = 0, g; + suff[m - 1] = m; + g = m - 1; + for (i = m - 2; i >= 0; --i) { + if (i > g && suff[i + m - 1 - f] < i - g) + suff[i] = suff[i + m - 1 - f]; + else { + if (i < g) g = i; + f = i; + while (g >= 0 && pat[g] == pat[g + m - 1 - f]) --g; + suff[i] = f - g; + } + } + } + { // preBmGs() + int j = 0; + for (i = 0; i < m; ++i) bmGs[i] = m; + for (i = m - 1; i >= 0; --i) + if (suff[i] == i + 1) + for (; j < m - 1 - i; ++j) + if (bmGs[j] == m) + bmGs[j] = m - 1 - i; + for (i = 0; i <= m - 2; ++i) + bmGs[m - 1 - suff[i]] = m - 1 - i; + } + free(suff); + return prep; +} + +int *ksBM_search(const uint8_t *str, int n, const uint8_t *pat, int m, int *_prep, int *n_matches) +{ + int i, j, *prep, *bmGs, *bmBc; + int *matches = 0, mm = 0, nm = 0; + prep = _prep? _prep : ksBM_prep(pat, m); + bmGs = prep; bmBc = prep + m; + j = 0; + while (j <= n - m) { + for (i = m - 1; i >= 0 && pat[i] == str[i+j]; --i); + if (i < 0) { + if (nm == mm) { + mm = mm? mm<<1 : 1; + matches = realloc(matches, mm * sizeof(int)); + } + matches[nm++] = j; + j += bmGs[0]; + } else { + int max = bmBc[str[i+j]] - m + 1 + i; + if (max < bmGs[i]) max = bmGs[i]; + j += max; + } + } + *n_matches = nm; + if (_prep == 0) free(prep); + return matches; +} + +#ifdef KSTRING_MAIN +#include +int main() +{ + kstring_t *s; + int *fields, n, i; + s = (kstring_t*)calloc(1, sizeof(kstring_t)); + // test ksprintf() + ksprintf(s, " abcdefg: %d ", 100); + printf("'%s'\n", s->s); + // test ksplit() + fields = ksplit(s, 0, &n); + for (i = 0; i < n; ++i) + printf("field[%d] = '%s'\n", i, s->s + fields[i]); + free(s); + + { + static char *str = "abcdefgcdg"; + static char *pat = "cd"; + int n, *matches; + matches = ksBM_search(str, strlen(str), pat, strlen(pat), 0, &n); + printf("%d: \n", n); + for (i = 0; i < n; ++i) + printf("- %d\n", matches[i]); + free(matches); + } + return 0; +} +#endif diff --git a/kstring.h b/kstring.h new file mode 100644 index 0000000..f4e5a99 --- /dev/null +++ b/kstring.h @@ -0,0 +1,68 @@ +#ifndef KSTRING_H +#define KSTRING_H + +#include +#include +#include + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +#ifndef KSTRING_T +#define KSTRING_T kstring_t +typedef struct __kstring_t { + size_t l, m; + char *s; +} kstring_t; +#endif + +int ksprintf(kstring_t *s, const char *fmt, ...); +int ksplit_core(char *s, int delimiter, int *_max, int **_offsets); + +// calculate the auxiliary array, allocated by calloc() +int *ksBM_prep(const uint8_t *pat, int m); + +/* Search pat in str and returned the list of matches. The size of the + * list is returned as n_matches. _prep is the array returned by + * ksBM_prep(). If it is a NULL pointer, ksBM_prep() will be called. */ +int *ksBM_search(const uint8_t *str, int n, const uint8_t *pat, int m, int *_prep, int *n_matches); + +static inline int kputsn(const char *p, int l, kstring_t *s) +{ + if (s->l + l + 1 >= s->m) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } + strncpy(s->s + s->l, p, l); + s->l += l; + s->s[s->l] = 0; + return l; +} + +static inline int kputs(const char *p, kstring_t *s) +{ + return kputsn(p, strlen(p), s); +} + +static inline int kputc(int c, kstring_t *s) +{ + if (s->l + 1 >= s->m) { + s->m = s->l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } + s->s[s->l++] = c; + s->s[s->l] = 0; + return c; +} + +static inline int *ksplit(kstring_t *s, int delimiter, int *n) +{ + int max = 0, *offsets = 0; + *n = ksplit_core(s->s, delimiter, &max, &offsets); + return offsets; +} + +#endif diff --git a/main.c b/main.c new file mode 100644 index 0000000..faef7fb --- /dev/null +++ b/main.c @@ -0,0 +1,93 @@ +#include +#include +#include +#include +#include "bgzf.h" +#include "tabix.h" + +#define PACKAGE_VERSION "0.1.0 (r506)" + +static int fetch_func(int l, const char *s, void *data) +{ + printf("%s\n", s); + return 0; +} + +int main(int argc, char *argv[]) +{ + int c, skip = -1, meta = -1; + ti_conf_t conf = ti_conf_gff; + while ((c = getopt(argc, argv, "p:s:b:e:0S:c:")) >= 0) { + switch (c) { + case '0': conf.preset |= TI_FLAG_UCSC; break; + case 'S': skip = atoi(optarg); break; + case 'c': meta = optarg[0]; break; + case 'p': + if (strcmp(optarg, "gff") == 0) conf = ti_conf_gff; + else if (strcmp(optarg, "bed") == 0) conf = ti_conf_bed; + else if (strcmp(optarg, "sam") == 0) conf = ti_conf_sam; + else if (strcmp(optarg, "vcf") == 0) conf = ti_conf_vcf; + else if (strcmp(optarg, "psltbl") == 0) conf = ti_conf_psltbl; + else { + fprintf(stderr, "[main] unrecognized preset '%s'\n", optarg); + return 1; + } + break; + case 's': conf.sc = atoi(optarg); break; + case 'b': conf.bc = atoi(optarg); break; + case 'e': conf.ec = atoi(optarg); break; + } + } + if (skip >= 0) conf.line_skip = skip; + if (meta >= 0) conf.meta_char = meta; + if (optind == argc) { + fprintf(stderr, "\n"); + fprintf(stderr, "Program: tabix (TAB-delimited file InderXer)\n"); + fprintf(stderr, "Version: %s\n\n", PACKAGE_VERSION); + fprintf(stderr, "Usage: tabix [region1 [region2 [...]]]\n\n"); + fprintf(stderr, "Options: -p STR preset: gff, bed, sam, vcf, psltbl [gff]\n"); + fprintf(stderr, " -s INT sequence name column [1]\n"); + fprintf(stderr, " -b INT start column [4]\n"); + fprintf(stderr, " -e INT end column [5]\n"); + fprintf(stderr, " -S INT skip first INT lines [0]\n"); + fprintf(stderr, " -c CHAR symbol for comment/meta lines [#]\n"); + fprintf(stderr, " -0 zero-based coordinate\n"); + fprintf(stderr, "\n"); + return 1; + } + if (optind + 1 == argc) + return ti_index_build(argv[optind], &conf); + { // retrieve + BGZF *fp; + fp = bgzf_open(argv[optind], "r"); + if (fp == 0) { + fprintf(stderr, "[main] fail to open the data file.\n"); + return 1; + } + if (strcmp(argv[optind+1], ".") == 0) { // retrieve all + kstring_t *str = calloc(1, sizeof(kstring_t)); + while (ti_readline(fp, str) >= 0) { // FIXME: check return code for error + fputs(str->s, stdout); fputc('\n', stdout); + } + free(str->s); free(str); + } else { // retrieve from specified regions + ti_index_t *idx; + int i; + idx = ti_index_load(argv[optind]); + if (idx == 0) { + bgzf_close(fp); + fprintf(stderr, "[main] fail to load the index.\n"); + return 1; + } + for (i = optind + 1; i < argc; ++i) { + int tid, beg, end; + if (ti_parse_region(idx, argv[i], &tid, &beg, &end) == 0) { + ti_fetch(fp, idx, tid, beg, end, 0, fetch_func); + } else fprintf(stderr, "[main] invalid region: unknown target name or minus interval.\n"); + } + ti_index_destroy(idx); + } + bgzf_close(fp); + } + return 0; +} diff --git a/tabix.1 b/tabix.1 new file mode 100644 index 0000000..6b6e2d6 --- /dev/null +++ b/tabix.1 @@ -0,0 +1,117 @@ +.TH tabix 1 "2 November 2009" "tabix-0.1.0" "Bioinformatics tools" +.SH NAME +.PP +bgzip - Block compression/decompression utility +.PP +tabix - Generic indexer for TAB-delimited genome position files +.SH SYNOPSIS +.PP +.B bgzip +.RB [ \-cdh ] +.RB [ \-b +.IR virtualOffset ] +.RB [ \-s +.IR size ] +.RI [ file ] +.PP +.B tabix +.RB [ \-0 ] +.RB [ \-p +.R gff|bed|sam|vcf] +.RB [ \-s +.IR seqCol ] +.RB [ \-b +.IR begCol ] +.RB [ \-e +.IR endCol ] +.RB [ \-S +.IR lineSkip ] +.RB [ \-c +.IR metaChar ] +.I in.tab.bgz +.RI [ "region1 " [ "region2 " [ ... "]]]" + +.SH DESCRIPTION +.PP +Tabix indexes a TAB-delimited genome position file +.I in.tab.bgz +and creates an index file +.I in.tab.bgz.tbi +when +.I region +is absent from the command-line. The input data file must be position +sorted and compressed by +.B bgzip +which has a +.BR gzip (1) +like interface. After indexing, tabix is able to quickly retrieve data +lines overlapping +.I regions +specified in the format "chr:beginPos-endPos". Fast data retrieval also +works over network if URI is given as a file name and in this case the +index file will be downloaded if it is not present locally. + +.SH OPTIONS OF TABIX +.TP 10 +.BI "-p " STR +Input format for indexing. Valid values are: gff, bed, sam, vcf and +psltab. This option should not be applied together with any of +.BR \-s ", " \-b ", " \-e ", " \-c " and " \-0 ; +it is not used for data retrieval because this setting is stored in +the index file. [gff] +.TP +.BI "-s " INT +Column of sequence name. Option +.BR \-s ", " \-b ", " \-e ", " \-S ", " \-c " and " \-0 +are all stored in the index file and thus not used in data retrieval. [1] +.TP +.BI "-b " INT +Column of start chromosomal position. [4] +.TP +.BI "-e " INT +Column of end chromosomal position. [5] +.TP +.BI "-S " INT +Skip first INT lines in the data file. [0] +.TP +.BI "-c " CHAR +Skip lines started with character CHAR. [#] +.TP +.B -0 +Specify that the position in the data file is 0-based (e.g. UCSC files) +rather than 1-based. +.RE + +.SH EXAMPLE +grep -v ^"#" unsorted.gff | sort -k1,1 -k4,4n | bgzip -c > sorted.gff.gz; + +tabix -p gff sorted.gff.gz; + +tabix sorted.gff.gz chr1:10,000,000-20,000,000; + +.SH NOTES +It is straightforward to achieve overlap queries using the standard +B-tree index (with or without binning) implemented in all SQL databases, +or the R-tree index in PostgreSQL and Oracle. But there are still many +reasons to use tabix. Firstly, tabix directly works with a lot of widely +used TAB-delimited formats such as GFF/GTF and BED. We do not need to +design database schema or specialized binary formats. Data do not need +to be duplicated in different formats, either. Secondly, tabix works on +compressed data files while most SQL databases do not. The GenCode +annotation GTF can be compressed down to 4%. Thirdly, tabix is +fast. The same indexing algorithm is known to work efficiently for an +alignment with a few billion short reads. SQL databases probably cannot +easily handle data at this scale. Last but not the least, tabix supports +remote data retrieval. One can put the data file and the index at an FTP +or HTTP server, and other users or even web services will be able to get +a slice without downloading the entire file. + +.SH AUTHOR +.PP +Tabix was written by Heng Li. The BGZF library was originally +implemented by Bob Handsaker and modified by Heng Li for remote file +access and in-memory caching. + +.SH SEE ALSO +.PP +.BR samtools (1) diff --git a/tabix.h b/tabix.h new file mode 100644 index 0000000..e608c79 --- /dev/null +++ b/tabix.h @@ -0,0 +1,43 @@ +#ifndef __TABIDX_H +#define __TABIDX_H + +#include +#include "kstring.h" +#include "bgzf.h" + +#define TI_PRESET_GENERIC 0 +#define TI_PRESET_SAM 1 +#define TI_PRESET_VCF 2 + +#define TI_FLAG_UCSC 0x10000 + +typedef int (*ti_fetch_f)(int l, const char *s, void *data); + +struct __ti_index_t; +typedef struct __ti_index_t ti_index_t; + +typedef struct { + int32_t preset; + int32_t sc, bc, ec; + int32_t meta_char, line_skip; +} ti_conf_t; + +extern ti_conf_t ti_conf_gff, ti_conf_bed, ti_conf_psltbl, ti_conf_vcf, ti_conf_sam; + +#ifdef __cplusplus +extern "C" { +#endif + + int ti_readline(BGZF *fp, kstring_t *str); + + int ti_index_build(const char *fn, const ti_conf_t *conf); + ti_index_t *ti_index_load(const char *fn); + void ti_index_destroy(ti_index_t *idx); + int ti_parse_region(ti_index_t *idx, const char *str, int *tid, int *begin, int *end); + int ti_fetch(BGZF *fp, const ti_index_t *idx, int tid, int beg, int end, void *data, ti_fetch_f func); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/tabix.txt b/tabix.txt new file mode 100644 index 0000000..5ba263b --- /dev/null +++ b/tabix.txt @@ -0,0 +1,89 @@ +tabix(1) Bioinformatics tools tabix(1) + + + +NAME + bgzip - Block compression/decompression utility + + tabix - Generic indexer for TAB-delimited genome position files + +SYNOPSIS + bgzip [-cdh] [-b virtualOffset] [-s size] [file] + + tabix [-0] [-p gff|bed|sam|vcf] [-s seqCol] [-b begCol] [-e endCol] [-S + lineSkip] [-c metaChar] in.tab.bgz [region1 [region2 [...]]] + + +DESCRIPTION + Tabix indexes a TAB-delimited genome position file in.tab.bgz and cre- + ates an index file in.tab.bgz.tbi when region is absent from the com- + mand-line. The input data file must be position sorted and compressed + by bgzip which has a gzip(1) like interface. After indexing, tabix is + able to quickly retrieve data lines overlapping regions specified in + the format "chr:beginPos-endPos". Fast data retrieval also works over + network if URI is given as a file name and in this case the index file + will be downloaded if it is not present locally. + + +OPTIONS OF TABIX + -p STR Input format for indexing. Valid values are: gff, bed, sam, + vcf and psltab. This option should not be applied together + with any of -s, -b, -e, -c and -0; it is not used for data + retrieval because this setting is stored in the index file. + [gff] + + -s INT Column of sequence name. Option -s, -b, -e, -S, -c and -0 are + all stored in the index file and thus not used in data + retrieval. [1] + + -b INT Column of start chromosomal position. [4] + + -e INT Column of end chromosomal position. [5] + + -S INT Skip first INT lines in the data file. [0] + + -c CHAR Skip lines started with character CHAR. [#] + + -0 Specify that the position in the data file is 0-based (e.g. + UCSC files) rather than 1-based. + + +EXAMPLE + grep -v ^"#" unsorted.gff | sort -k1,1 -k4,4n | bgzip -c > + sorted.gff.gz; + + tabix -p gff sorted.gff.gz; + + tabix sorted.gff.gz chr1:10,000,000-20,000,000; + + +NOTES + It is straightforward to achieve overlap queries using the standard B- + tree index (with or without binning) implemented in all SQL databases, + or the R-tree index in PostgreSQL and Oracle. But there are still many + reasons to use tabix. Firstly, tabix directly works with a lot of + widely used TAB-delimited formats such as GFF/GTF and BED. We do not + need to design database schema or specialized binary formats. Data do + not need to be duplicated in different formats, either. Secondly, tabix + works on compressed data files while most SQL databases do not. The + GenCode annotation GTF can be compressed down to 4%. Thirdly, tabix is + fast. The same indexing algorithm is known to work efficiently for an + alignment with a few billion short reads. SQL databases probably cannot + easily handle data at this scale. Last but not the least, tabix sup- + ports remote data retrieval. One can put the data file and the index at + an FTP or HTTP server, and other users or even web services will be + able to get a slice without downloading the entire file. + + +AUTHOR + Tabix was written by Heng Li. The BGZF library was originally imple- + mented by Bob Handsaker and modified by Heng Li for remote file access + and in-memory caching. + + +SEE ALSO + samtools(1) + + + +tabix-0.1.0 2 November 2009 tabix(1) -- 2.30.2