X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=pysam.git;a=blobdiff_plain;f=samtools%2Fkseq.h;h=82face095919a3991b1f927bb5422ffd95f102a4;hp=0bbc7dc9370e910b19286167bd0862c64855387c;hb=e1756c41e7a1d7cc01fb95e42df9dd04da2d2991;hpb=ca46ef4ba4a883c57cea62d5bf1bc021f1185109 diff --git a/samtools/kseq.h b/samtools/kseq.h index 0bbc7dc..82face0 100644 --- a/samtools/kseq.h +++ b/samtools/kseq.h @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2008, 2009, 2011 Attractive Chaos + Copyright (c) 2008 Genome Research Ltd (GRL). Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -23,7 +23,13 @@ SOFTWARE. */ -/* Last Modified: 18AUG2011 */ +/* Contact: Heng Li */ + +/* + 2009-07-16 (lh3): in kstream_t, change "char*" to "unsigned char*" + */ + +/* Last Modified: 12APR2009 */ #ifndef AC_KSEQ_H #define AC_KSEQ_H @@ -88,10 +94,10 @@ typedef struct __kstring_t { #endif #define __KS_GETUNTIL(__read, __bufsize) \ - static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \ + static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ { \ if (dret) *dret = 0; \ - str->l = append? str->l : 0; \ + str->l = 0; \ if (ks->begin >= ks->end && ks->is_eof) return -1; \ for (;;) { \ int i; \ @@ -126,15 +132,13 @@ typedef struct __kstring_t { break; \ } \ } \ - if (str->s == 0) { \ + if (str->l == 0) { \ str->m = 1; \ str->s = (char*)calloc(1, 1); \ } \ str->s[str->l] = '\0'; \ return str->l; \ - } \ - static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ - { return ks_getuntil2(ks, delimiter, str, dret, 0); } + } #define KSTREAM_INIT(type_t, __read, __bufsize) \ __KS_TYPE(type_t) \ @@ -167,45 +171,44 @@ typedef struct __kstring_t { -1 end-of-file -2 truncated quality string */ -#define __KSEQ_READ \ - static int kseq_read(kseq_t *seq) \ - { \ - int c; \ - kstream_t *ks = seq->f; \ +#define __KSEQ_READ \ + static int kseq_read(kseq_t *seq) \ + { \ + int c; \ + kstream_t *ks = seq->f; \ if (seq->last_char == 0) { /* then jump to the next header line */ \ - while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ - if (c == -1) return -1; /* end of file */ \ - seq->last_char = c; \ - } /* else: the first header char has been read in the previous call */ \ - seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \ - if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \ - if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); /* read FASTA/Q comment */ \ - if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \ - seq->seq.m = 256; \ - seq->seq.s = (char*)malloc(seq->seq.m); \ - } \ + while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ + if (c == -1) return -1; /* end of file */ \ + seq->last_char = c; \ + } /* the first header char has been read */ \ + seq->comment.l = seq->seq.l = seq->qual.l = 0; \ + if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; \ + if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); \ while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ - seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \ - ks_getuntil2(ks, '\n', &seq->seq, 0, 1); /* read the rest of the line */ \ - } \ + if (isgraph(c)) { /* printable non-space character */ \ + if (seq->seq.l + 1 >= seq->seq.m) { /* double the memory */ \ + seq->seq.m = seq->seq.l + 2; \ + kroundup32(seq->seq.m); /* rounded to next closest 2^k */ \ + seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ + } \ + seq->seq.s[seq->seq.l++] = (char)c; \ + } \ + } \ if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ - if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \ - seq->seq.m = seq->seq.l + 2; \ - kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \ - seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ - } \ - seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ - if (c != '+') return seq->seq.l; /* FASTA */ \ - if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \ - seq->qual.m = seq->seq.m; \ - seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ - } \ + seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ + if (c != '+') return seq->seq.l; /* FASTA */ \ + if (seq->qual.m < seq->seq.m) { /* allocate enough memory */ \ + seq->qual.m = seq->seq.m; \ + seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ + } \ while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ - if (c == -1) return -2; /* error: no quality string */ \ - while (ks_getuntil2(ks, '\n', &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \ + if (c == -1) return -2; /* we should not stop here */ \ + while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l) \ + if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c; \ + seq->qual.s[seq->qual.l] = 0; /* null terminated string */ \ seq->last_char = 0; /* we have not come to the next header line */ \ - if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \ - return seq->seq.l; \ + if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */ \ + return seq->seq.l; \ } #define __KSEQ_TYPE(type_t) \ @@ -216,7 +219,7 @@ typedef struct __kstring_t { } kseq_t; #define KSEQ_INIT(type_t, __read) \ - KSTREAM_INIT(type_t, __read, 16384) \ + KSTREAM_INIT(type_t, __read, 4096) \ __KSEQ_TYPE(type_t) \ __KSEQ_BASIC(type_t) \ __KSEQ_READ