Imported Upstream version 0.1.18
authorCharles Plessy <plessy@debian.org>
Tue, 6 Sep 2011 06:48:39 +0000 (15:48 +0900)
committerCharles Plessy <plessy@debian.org>
Tue, 6 Sep 2011 06:48:39 +0000 (15:48 +0900)
25 files changed:
Makefile.mingw
NEWS
bam.c
bam.h
bam2bcf.c
bam2bcf.h
bam2bcf_indel.c
bam_aux.c
bam_import.c
bam_index.c
bam_md.c
bam_pileup.c
bcftools/bcf.h
bcftools/bcfutils.c
bcftools/call1.c
bcftools/em.c
bcftools/mut.c
bedidx.c
kprobaln.c
kseq.h
misc/Makefile
misc/seqtk.c [new file with mode: 0644]
misc/wgsim.c
sam_view.c
sample.c

index 836360f3df3bfdb8587c67ef42c3402eef4ba985..7a57ffc0c7457c5fcfb65dd8585bd3ee3cb1114c 100644 (file)
@@ -4,14 +4,15 @@ CFLAGS=               -g -Wall -O2
 DFLAGS=                -D_USE_KNETFILE -D_CURSES_LIB=2
 KNETFILE_O=    knetfile.o
 LOBJS=         bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \
-                       bam_pileup.o bam_lpileup.o bam_md.o glf.o razf.o faidx.o \
-                       $(KNETFILE_O) bam_sort.o sam_header.o bam_reheader.o kprobaln.o
-AOBJS=         bam_tview.o bam_maqcns.o bam_plcmd.o sam_view.o \
+                       bam_pileup.o bam_lpileup.o bam_md.o razf.o faidx.o \
+                       $(KNETFILE_O) bam_sort.o sam_header.o bam_reheader.o kprobaln.o bedidx.o
+AOBJS=         bam_tview.o bam_plcmd.o sam_view.o \
                        bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o     \
-                       bamtk.o kaln.o bam2bcf.o bam2bcf_indel.o errmod.o sample.o
+                       bamtk.o kaln.o bam2bcf.o bam2bcf_indel.o errmod.o sample.o \
+                       cut_target.o phase.o bam_cat.o bam2depth.o
 BCFOBJS=       bcftools/bcf.o bcftools/fet.o bcftools/bcf2qcall.o bcftools/bcfutils.o \
-                       bcftools/call1.o bcftools/index.o bcftools/kfunc.o bcftools/ld.o \
-                       bcftools/prob1.o bcftools/vcf.o
+                       bcftools/call1.o bcftools/index.o bcftools/kfunc.o bcftools/em.o \
+                       bcftools/kmin.o bcftools/prob1.o bcftools/vcf.o bcftools/mut.o
 PROG=          samtools.exe bcftools.exe
 INCLUDES=      -I. -Iwin32
 SUBDIRS=       .
@@ -35,22 +36,20 @@ libbam.a:$(LOBJS)
 samtools.exe:$(AOBJS) libbam.a $(BCFOBJS)
                $(CC) $(CFLAGS) -o $@ $(AOBJS) $(BCFOBJS) $(LIBPATH) -lm -L. -lbam -Lwin32 -lz -lcurses -lws2_32
 
-bcftools.exe:$(BCFOBJS) bcftools/main.o kstring.o bgzf.o knetfile.o
-               $(CC) $(CFLAGS) -o $@ $(BCFOBJS) bcftools/main.o kstring.o bgzf.o knetfile.o -lm -Lwin32 -lz -lws2_32
+bcftools.exe:$(BCFOBJS) bcftools/main.o kstring.o bgzf.o knetfile.o bedidx.o
+               $(CC) $(CFLAGS) -o $@ $(BCFOBJS) bcftools/main.o kstring.o bgzf.o knetfile.o bedidx.o -lm -Lwin32 -lz -lws2_32
 
 razip.o:razf.h
 bam.o:bam.h razf.h bam_endian.h kstring.h sam_header.h
 sam.o:sam.h bam.h
 bam_import.o:bam.h kseq.h khash.h razf.h
 bam_pileup.o:bam.h razf.h ksort.h
-bam_plcmd.o:bam.h faidx.h bam_maqcns.h glf.h bcftools/bcf.h bam2bcf.h
+bam_plcmd.o:bam.h faidx.h bcftools/bcf.h bam2bcf.h
 bam_index.o:bam.h khash.h ksort.h razf.h bam_endian.h
 bam_lpileup.o:bam.h ksort.h
-bam_tview.o:bam.h faidx.h bam_maqcns.h
-bam_maqcns.o:bam.h ksort.h bam_maqcns.h kaln.h
+bam_tview.o:bam.h faidx.h
 bam_sort.o:bam.h ksort.h razf.h
 bam_md.o:bam.h faidx.h
-glf.o:glf.h
 sam_header.o:sam_header.h khash.h
 bcf.o:bcftools/bcf.h
 bam2bcf.o:bam2bcf.h errmod.h bcftools/bcf.h
diff --git a/NEWS b/NEWS
index 1aa23593df188b1827f6e46bf1779874552fd100..41a6cc8ebe05185ea8e97f0639fede7ce15675d9 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,30 @@
+Beta Release 0.1.18 (2 September, 2011)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Notable changes in samtools:
+
+ * Support the new =/X CIGAR operators (by Peter Cock).
+
+ * Allow to subsample BAM while keeping the pairing intact (view -s).
+
+ * Implemented variant distance bias as a new filter (by Petr Danecek).
+
+ * Bugfix: huge memory usage during indexing
+
+ * Bugfix: use of uninitialized variable in mpileup (rare)
+
+ * Bugfix: wrong BAQ probability (rare)
+
+Notable changes in bcftools:
+
+ * Support indel in the contrast caller.
+
+ * Bugfix: LRT2=nan in rare cases
+
+(0.1.18: 2 September 2011, r982:295)
+
+
+
 Beta Release 0.1.17 (6 July, 2011)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/bam.c b/bam.c
index 5fac17df6ec27d7cbb04b5600c3ce54d5c06b354..0055e8427667dd95151ea1a1bef5dd14e74e15f1 100644 (file)
--- a/bam.c
+++ b/bam.c
@@ -32,7 +32,7 @@ int32_t bam_cigar2qlen(const bam1_core_t *c, const uint32_t *cigar)
        int32_t l = 0;
        for (k = 0; k < c->n_cigar; ++k) {
                int op = cigar[k] & BAM_CIGAR_MASK;
-               if (op == BAM_CMATCH || op == BAM_CINS || op == BAM_CSOFT_CLIP)
+               if (op == BAM_CMATCH || op == BAM_CINS || op == BAM_CSOFT_CLIP || op == BAM_CEQUAL || op == BAM_CDIFF)
                        l += cigar[k] >> BAM_CIGAR_SHIFT;
        }
        return l;
@@ -268,7 +268,7 @@ char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of)
        else {
                for (i = 0; i < c->n_cigar; ++i) {
                        kputw(bam1_cigar(b)[i]>>BAM_CIGAR_SHIFT, &str);
-                       kputc("MIDNSHP"[bam1_cigar(b)[i]&BAM_CIGAR_MASK], &str);
+                       kputc("MIDNSHP=X"[bam1_cigar(b)[i]&BAM_CIGAR_MASK], &str);
                }
        }
        kputc('\t', &str);
diff --git a/bam.h b/bam.h
index 246b020d679a07f073cfbae8dc40a8d57b501d25..346c750c8c807c44ea32804eeb1bb93411d0b71a 100644 (file)
--- a/bam.h
+++ b/bam.h
@@ -40,7 +40,7 @@
   @copyright Genome Research Ltd.
  */
 
-#define BAM_VERSION "0.1.17 (r973:277)"
+#define BAM_VERSION "0.1.18 (r982:295)"
 
 #include <stdint.h>
 #include <stdlib.h>
@@ -134,20 +134,25 @@ typedef struct {
 /*
   CIGAR operations.
  */
-/*! @abstract CIGAR: match */
+/*! @abstract CIGAR: M = match or mismatch*/
 #define BAM_CMATCH      0
-/*! @abstract CIGAR: insertion to the reference */
+/*! @abstract CIGAR: I = insertion to the reference */
 #define BAM_CINS        1
-/*! @abstract CIGAR: deletion from the reference */
+/*! @abstract CIGAR: D = deletion from the reference */
 #define BAM_CDEL        2
-/*! @abstract CIGAR: skip on the reference (e.g. spliced alignment) */
+/*! @abstract CIGAR: N = skip on the reference (e.g. spliced alignment) */
 #define BAM_CREF_SKIP   3
-/*! @abstract CIGAR: clip on the read with clipped sequence present in qseq */
+/*! @abstract CIGAR: S = clip on the read with clipped sequence
+  present in qseq */
 #define BAM_CSOFT_CLIP  4
-/*! @abstract CIGAR: clip on the read with clipped sequence trimmed off */
+/*! @abstract CIGAR: H = clip on the read with clipped sequence trimmed off */
 #define BAM_CHARD_CLIP  5
-/*! @abstract CIGAR: padding */
+/*! @abstract CIGAR: P = padding */
 #define BAM_CPAD        6
+/*! @abstract CIGAR: equals = match */
+#define BAM_CEQUAL        7
+/*! @abstract CIGAR: X = mismatch */
+#define BAM_CDIFF        8
 
 /*! @typedef
   @abstract Structure for core alignment information.
index f263fad55a5d1f1438fd50d127fb2362309f100b..dec3305340f689a100d29dc4f9b5998e7692c6d4 100644 (file)
--- a/bam2bcf.c
+++ b/bam2bcf.c
@@ -39,6 +39,7 @@ void bcf_call_destroy(bcf_callaux_t *bca)
  * negative if we are looking at an indel. */
 int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t *bca, bcf_callret1_t *r)
 {
+    static int *var_pos = NULL, nvar_pos = 0;
        int i, n, ref4, is_indel, ori_depth = 0;
        memset(r, 0, sizeof(bcf_callret1_t));
        if (ref_base >= 0) {
@@ -94,9 +95,92 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
        r->depth = n; r->ori_depth = ori_depth;
        // glfgen
        errmod_cal(bca->e, n, 5, bca->bases, r->p);
+
+    // Calculate the Variant Distance Bias (make it optional?)
+    if ( nvar_pos < _n ) {
+        nvar_pos = _n;
+        var_pos = realloc(var_pos,sizeof(int)*nvar_pos);
+    }
+    int alt_dp=0, read_len=0;
+    for (i=0; i<_n; i++) {
+        const bam_pileup1_t *p = pl + i;
+        if ( bam1_seqi(bam1_seq(p->b),p->qpos) == ref_base ) 
+            continue;
+
+        var_pos[alt_dp] = p->qpos;
+        if ( (bam1_cigar(p->b)[0]&BAM_CIGAR_MASK)==4 )
+            var_pos[alt_dp] -= bam1_cigar(p->b)[0]>>BAM_CIGAR_SHIFT;
+
+        alt_dp++;
+        read_len += p->b->core.l_qseq;
+    }
+    float mvd=0;
+    int j;
+    n=0;
+    for (i=0; i<alt_dp; i++) {
+        for (j=0; j<i; j++) {
+            mvd += abs(var_pos[i] - var_pos[j]);
+            n++;
+        }
+    }
+    r->mvd[0] = n ? mvd/n : 0;
+    r->mvd[1] = alt_dp;
+    r->mvd[2] = alt_dp ? read_len/alt_dp : 0;
+
        return r->depth;
 }
 
+
+void calc_vdb(int n, const bcf_callret1_t *calls, bcf_call_t *call)
+{
+    // Variant distance bias. Samples merged by means of DP-weighted average.
+
+    float weight=0, tot_prob=0;
+
+    int i;
+    for (i=0; i<n; i++)
+    {
+        int mvd      = calls[i].mvd[0];
+        int dp       = calls[i].mvd[1];
+        int read_len = calls[i].mvd[2];
+
+        if ( dp<2 ) continue;
+
+        float prob = 0;
+        if ( dp==2 )
+        {
+            // Exact formula
+            prob = (mvd==0) ? 1.0/read_len : (read_len-mvd)*2.0/read_len/read_len;
+        }
+        else if ( dp==3 )
+        {
+            // Sin, quite accurate approximation
+            float mu = read_len/2.9;
+            prob = mvd>2*mu ? 0 : sin(mvd*3.14/2/mu) / (4*mu/3.14);
+        }
+        else
+        {
+            // Scaled gaussian curve, crude approximation, but behaves well. Using fixed depth for bigger depths.
+            if ( dp>5 )
+                dp = 5;
+            float sigma2 = (read_len/1.9/(dp+1)) * (read_len/1.9/(dp+1));
+            float norm   = 1.125*sqrt(2*3.14*sigma2);
+            float mu     = read_len/2.9;
+            if ( mvd < mu )
+                prob = exp(-(mvd-mu)*(mvd-mu)/2/sigma2)/norm;
+            else
+                prob = exp(-(mvd-mu)*(mvd-mu)/3.125/sigma2)/norm;
+        }
+
+        //fprintf(stderr,"dp=%d mvd=%d read_len=%d -> prob=%f\n", dp,mvd,read_len,prob);
+        tot_prob += prob*dp;
+        weight += dp;
+    }
+    tot_prob = weight ? tot_prob/weight : 1; 
+    //fprintf(stderr,"prob=%f\n", tot_prob);
+    call->vdb = tot_prob;
+}
+
 int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, bcf_call_t *call)
 {
        int ref4, i, j, qsum[4];
@@ -170,6 +254,9 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/,
                call->ori_depth += calls[i].ori_depth;
                for (j = 0; j < 16; ++j) call->anno[j] += calls[i].anno[j];
        }
+
+    calc_vdb(n, calls, call);
+
        return 0;
 }
 
@@ -223,6 +310,10 @@ int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bc
                if (i) kputc(',', &s);
                kputw(bc->anno[i], &s);
        }
+    if ( bc->vdb!=1 )
+    {
+        ksprintf(&s, ";VDB=%.4f", bc->vdb);
+    }
        kputc('\0', &s);
        // FMT
        kputs("PL", &s);
index 958567241936a4c54feae501c725ad77a6e3f1be..4af080c4753b2070192bec5e04e16174ca5bb5f8 100644 (file)
--- a/bam2bcf.h
+++ b/bam2bcf.h
@@ -26,6 +26,7 @@ typedef struct {
        int depth, ori_depth, qsum[4];
        int anno[16];
        float p[25];
+    int mvd[3]; // mean variant distance, number of variant reads, average read length
 } bcf_callret1_t;
 
 typedef struct {
@@ -33,6 +34,7 @@ typedef struct {
        int n, n_alleles, shift, ori_ref, unseen;
        int anno[16], depth, ori_depth;
        uint8_t *PL;
+    float vdb; // variant distance bias
 } bcf_call_t;
 
 #ifdef __cplusplus
index 2fdee9f75a2261266912b7fe355e9ea51892ff6d..5142b3e100aa96a7ab598a95b9953aa4c6034289 100644 (file)
@@ -66,7 +66,7 @@ static int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos,
        for (k = 0; k < c->n_cigar; ++k) {
                int op = cigar[k] & BAM_CIGAR_MASK;
                int l = cigar[k] >> BAM_CIGAR_SHIFT;
-               if (op == BAM_CMATCH) {
+               if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
                        if (c->pos > tpos) return y;
                        if (x + l > tpos) {
                                *_tpos = tpos;
@@ -222,7 +222,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
                                for (k = 0; k < b->core.n_cigar; ++k) {
                                        int op = cigar[k]&0xf;
                                        int j, l = cigar[k]>>4;
-                                       if (op == BAM_CMATCH) {
+                                       if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
                                                for (j = 0; j < l; ++j)
                                                        if (x + j >= left && x + j < right)
                                                                cns[x+j-left] += (bam1_seqi(seq, y+j) == ref0[x+j-left])? 1 : 0x10000;
index 2247bdfe9ba15d39d5a1dcf193da9d68cdfab8bf..28b22e3059273da57dd8395fba5e29287dc3228b 100644 (file)
--- a/bam_aux.c
+++ b/bam_aux.c
@@ -59,6 +59,23 @@ int bam_aux_del(bam1_t *b, uint8_t *s)
        return 0;
 }
 
+int bam_aux_drop_other(bam1_t *b, uint8_t *s)
+{
+       if (s) {
+               uint8_t *p, *aux;
+               aux = bam1_aux(b);
+               p = s - 2;
+               __skip_tag(s);
+               memmove(aux, p, s - p);
+               b->data_len -= b->l_aux - (s - p);
+               b->l_aux = s - p;
+       } else {
+               b->data_len -= b->l_aux;
+               b->l_aux = 0;
+       }
+       return 0;
+}
+
 void bam_init_header_hash(bam_header_t *header)
 {
        if (header->hash == 0) {
index 29516c92ff42593313caf2833c38213d62ebdc52..5518a9cbba694b2ce3b24d6460938103e44fde0e 100644 (file)
@@ -14,7 +14,7 @@
 #include "kseq.h"
 #include "khash.h"
 
-KSTREAM_INIT(gzFile, gzread, 8192)
+KSTREAM_INIT(gzFile, gzread, 16384)
 KHASH_MAP_INIT_STR(ref, uint64_t)
 
 void bam_init_header_hash(bam_header_t *header);
@@ -292,20 +292,22 @@ int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b)
                z += str->l + 1;
                if (str->s[0] != '*') {
                        for (s = str->s; *s; ++s) {
-                               if (isalpha(*s)) ++c->n_cigar;
+                               if ((isalpha(*s)) || (*s=='=')) ++c->n_cigar;
                                else if (!isdigit(*s)) parse_error(fp->n_lines, "invalid CIGAR character");
                        }
                        b->data = alloc_data(b, doff + c->n_cigar * 4);
                        for (i = 0, s = str->s; i != c->n_cigar; ++i) {
                                x = strtol(s, &t, 10);
                                op = toupper(*t);
-                               if (op == 'M' || op == '=' || op == 'X') op = BAM_CMATCH;
+                               if (op == 'M') op = BAM_CMATCH;
                                else if (op == 'I') op = BAM_CINS;
                                else if (op == 'D') op = BAM_CDEL;
                                else if (op == 'N') op = BAM_CREF_SKIP;
                                else if (op == 'S') op = BAM_CSOFT_CLIP;
                                else if (op == 'H') op = BAM_CHARD_CLIP;
                                else if (op == 'P') op = BAM_CPAD;
+                               else if (op == '=') op = BAM_CEQUAL;
+                               else if (op == 'X') op = BAM_CDIFF;
                                else parse_error(fp->n_lines, "invalid CIGAR operation");
                                s = t + 1;
                                bam1_cigar(b)[i] = x << BAM_CIGAR_SHIFT | op;
@@ -337,8 +339,11 @@ int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b)
                z += str->l + 1;
                if (strcmp(str->s, "*")) {
                        c->l_qseq = strlen(str->s);
-                       if (c->n_cigar && c->l_qseq != (int32_t)bam_cigar2qlen(c, bam1_cigar(b)))
-                               parse_error(fp->n_lines, "CIGAR and sequence length are inconsistent");
+                       if (c->n_cigar && c->l_qseq != (int32_t)bam_cigar2qlen(c, bam1_cigar(b))) {
+                         fprintf(stderr, "Line %ld, sequence length %i vs %i from CIGAR\n",
+                                 (long)fp->n_lines, c->l_qseq, (int32_t)bam_cigar2qlen(c, bam1_cigar(b)));
+                         parse_error(fp->n_lines, "CIGAR and sequence length are inconsistent");
+                       }
                        p = (uint8_t*)alloc_data(b, doff + c->l_qseq + (c->l_qseq+1)/2) + doff;
                        memset(p, 0, (c->l_qseq+1)/2);
                        for (i = 0; i < c->l_qseq; ++i)
index 66d8eb8a896ab5a0a4decf612e65eac158e92cbf..9610a2656031657b796abc68bf32c1b6f1564745 100644 (file)
@@ -188,7 +188,7 @@ bam_index_t *bam_index_core(bamFile fp)
                                        bam1_qname(b), last_coor, c->pos, c->tid+1);
                        return NULL;
                }
-               if (c->tid >= 0) insert_offset2(&idx->index2[b->core.tid], b, last_off);
+               if (c->tid >= 0 && !(c->flag & BAM_FUNMAP)) insert_offset2(&idx->index2[b->core.tid], b, last_off);
                if (c->bin != last_bin) { // then possibly write the binning index
                        if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record
                                insert_offset(idx->index[save_tid], save_bin, save_off, last_off);
index 20b1913efa09ce796392885b0f4b831c8c759f4a..d42aa8fc971fa627ccf34a781f0432214ac6f8e0 100644 (file)
--- a/bam_md.c
+++ b/bam_md.c
@@ -9,40 +9,46 @@
 #include "kaln.h"
 #include "kprobaln.h"
 
+#define USE_EQUAL 1
+#define DROP_TAG  2
+#define BIN_QUAL  4
+#define UPDATE_NM 8
+#define UPDATE_MD 16
+#define HASH_QNM  32
+
 char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
 
-void bam_fillmd1_core(bam1_t *b, char *ref, int is_equal, int max_nm)
+int bam_aux_drop_other(bam1_t *b, uint8_t *s);
+
+void bam_fillmd1_core(bam1_t *b, char *ref, int flag, int max_nm)
 {
        uint8_t *seq = bam1_seq(b);
        uint32_t *cigar = bam1_cigar(b);
        bam1_core_t *c = &b->core;
        int i, x, y, u = 0;
        kstring_t *str;
-       uint8_t *old_md, *old_nm;
        int32_t old_nm_i = -1, nm = 0;
 
        str = (kstring_t*)calloc(1, sizeof(kstring_t));
        for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
                int j, l = cigar[i]>>4, op = cigar[i]&0xf;
-               if (op == BAM_CMATCH) {
+               if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
                        for (j = 0; j < l; ++j) {
                                int z = y + j;
                                int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]];
                                if (ref[x+j] == 0) break; // out of boundary
                                if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match
-                                       if (is_equal) seq[z/2] &= (z&1)? 0xf0 : 0x0f;
+                                       if (flag&USE_EQUAL) seq[z/2] &= (z&1)? 0xf0 : 0x0f;
                                        ++u;
                                } else {
-                                       ksprintf(str, "%d", u);
-                                       kputc(ref[x+j], str);
+                                       kputw(u, str); kputc(ref[x+j], str);
                                        u = 0; ++nm;
                                }
                        }
                        if (j < l) break;
                        x += l; y += l;
                } else if (op == BAM_CDEL) {
-                       ksprintf(str, "%d", u);
-                       kputc('^', str);
+                       kputw(u, str); kputc('^', str);
                        for (j = 0; j < l; ++j) {
                                if (ref[x+j] == 0) break;
                                kputc(ref[x+j], str);
@@ -57,12 +63,12 @@ void bam_fillmd1_core(bam1_t *b, char *ref, int is_equal, int max_nm)
                        x += l;
                }
        }
-       ksprintf(str, "%d", u);
+       kputw(u, str);
        // apply max_nm
        if (max_nm > 0 && nm >= max_nm) {
                for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
                        int j, l = cigar[i]>>4, op = cigar[i]&0xf;
-                       if (op == BAM_CMATCH) {
+                       if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
                                for (j = 0; j < l; ++j) {
                                        int z = y + j;
                                        int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]];
@@ -79,38 +85,54 @@ void bam_fillmd1_core(bam1_t *b, char *ref, int is_equal, int max_nm)
                }
        }
        // update NM
-       old_nm = bam_aux_get(b, "NM");
-       if (c->flag & BAM_FUNMAP) return;
-       if (old_nm) old_nm_i = bam_aux2i(old_nm);
-       if (!old_nm) bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
-       else if (nm != old_nm_i) {
-               fprintf(stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam1_qname(b), old_nm_i, nm);
-               bam_aux_del(b, old_nm);
-               bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
+       if (flag & UPDATE_NM) {
+               uint8_t *old_nm = bam_aux_get(b, "NM");
+               if (c->flag & BAM_FUNMAP) return;
+               if (old_nm) old_nm_i = bam_aux2i(old_nm);
+               if (!old_nm) bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
+               else if (nm != old_nm_i) {
+                       fprintf(stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam1_qname(b), old_nm_i, nm);
+                       bam_aux_del(b, old_nm);
+                       bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
+               }
        }
        // update MD
-       old_md = bam_aux_get(b, "MD");
-       if (!old_md) bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s);
-       else {
-               int is_diff = 0;
-               if (strlen((char*)old_md+1) == str->l) {
-                       for (i = 0; i < str->l; ++i)
-                               if (toupper(old_md[i+1]) != toupper(str->s[i]))
-                                       break;
-                       if (i < str->l) is_diff = 1;
-               } else is_diff = 1;
-               if (is_diff) {
-                       fprintf(stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam1_qname(b), old_md+1, str->s);
-                       bam_aux_del(b, old_md);
-                       bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s);
+       if (flag & UPDATE_MD) {
+               uint8_t *old_md = bam_aux_get(b, "MD");
+               if (c->flag & BAM_FUNMAP) return;
+               if (!old_md) bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s);
+               else {
+                       int is_diff = 0;
+                       if (strlen((char*)old_md+1) == str->l) {
+                               for (i = 0; i < str->l; ++i)
+                                       if (toupper(old_md[i+1]) != toupper(str->s[i]))
+                                               break;
+                               if (i < str->l) is_diff = 1;
+                       } else is_diff = 1;
+                       if (is_diff) {
+                               fprintf(stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam1_qname(b), old_md+1, str->s);
+                               bam_aux_del(b, old_md);
+                               bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s);
+                       }
                }
        }
+       // drop all tags but RG
+       if (flag&DROP_TAG) {
+               uint8_t *q = bam_aux_get(b, "RG");
+               bam_aux_drop_other(b, q);
+       }
+       // reduce the resolution of base quality
+       if (flag&BIN_QUAL) {
+               uint8_t *qual = bam1_qual(b);
+               for (i = 0; i < b->core.l_qseq; ++i)
+                       if (qual[i] >= 3) qual[i] = qual[i]/10*10 + 7;
+       }
        free(str->s); free(str);
 }
 
-void bam_fillmd1(bam1_t *b, char *ref, int is_equal)
+void bam_fillmd1(bam1_t *b, char *ref, int flag)
 {
-       bam_fillmd1_core(b, ref, is_equal, 0);
+       bam_fillmd1_core(b, ref, flag, 0);
 }
 
 int bam_cap_mapQ(bam1_t *b, char *ref, int thres)
@@ -124,7 +146,7 @@ int bam_cap_mapQ(bam1_t *b, char *ref, int thres)
        mm = q = len = clip_l = clip_q = 0;
        for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
                int j, l = cigar[i]>>4, op = cigar[i]&0xf;
-               if (op == BAM_CMATCH) {
+               if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
                        for (j = 0; j < l; ++j) {
                                int z = y + j;
                                int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]];
@@ -197,7 +219,7 @@ int bam_prob_realn_core(bam1_t *b, const char *ref, int flag)
        for (k = 0; k < c->n_cigar; ++k) {
                int op, l;
                op = cigar[k]&0xf; l = cigar[k]>>4;
-               if (op == BAM_CMATCH) {
+               if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
                        if (yb < 0) yb = y;
                        if (xb < 0) xb = x;
                        ye = y + l; xe = x + l;
@@ -233,7 +255,7 @@ int bam_prob_realn_core(bam1_t *b, const char *ref, int flag)
                if (!extend_baq) { // in this block, bq[] is capped by base quality qual[]
                        for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) {
                                int op = cigar[k]&0xf, l = cigar[k]>>4;
-                               if (op == BAM_CMATCH) {
+                               if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
                                        for (i = y; i < y + l; ++i) {
                                                if ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y)) bq[i] = 0;
                                                else bq[i] = bq[i] < q[i]? bq[i] : q[i];
@@ -248,7 +270,7 @@ int bam_prob_realn_core(bam1_t *b, const char *ref, int flag)
                        left = calloc(c->l_qseq, 1); rght = calloc(c->l_qseq, 1);
                        for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) {
                                int op = cigar[k]&0xf, l = cigar[k]>>4;
-                               if (op == BAM_CMATCH) {
+                               if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
                                        for (i = y; i < y + l; ++i)
                                                bq[i] = ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y))? 0 : q[i];
                                        for (left[y] = bq[y], i = y + 1; i < y + l; ++i)
@@ -280,19 +302,24 @@ int bam_prob_realn(bam1_t *b, const char *ref)
 
 int bam_fillmd(int argc, char *argv[])
 {
-       int c, is_equal, tid = -2, ret, len, is_bam_out, is_sam_in, is_uncompressed, max_nm, is_realn, capQ, baq_flag;
+       int c, flt_flag, tid = -2, ret, len, is_bam_out, is_sam_in, is_uncompressed, max_nm, is_realn, capQ, baq_flag;
        samfile_t *fp, *fpout = 0;
        faidx_t *fai;
        char *ref = 0, mode_w[8], mode_r[8];
        bam1_t *b;
 
-       is_equal = is_bam_out = is_sam_in = is_uncompressed = is_realn = max_nm = capQ = baq_flag = 0;
+       flt_flag = UPDATE_NM | UPDATE_MD;
+       is_bam_out = is_sam_in = is_uncompressed = is_realn = max_nm = capQ = baq_flag = 0;
        mode_w[0] = mode_r[0] = 0;
        strcpy(mode_r, "r"); strcpy(mode_w, "w");
-       while ((c = getopt(argc, argv, "EreubSC:n:A")) >= 0) {
+       while ((c = getopt(argc, argv, "EqreuNhbSC:n:Ad")) >= 0) {
                switch (c) {
                case 'r': is_realn = 1; break;
-               case 'e': is_equal = 1; break;
+               case 'e': flt_flag |= USE_EQUAL; break;
+               case 'd': flt_flag |= DROP_TAG; break;
+               case 'q': flt_flag |= BIN_QUAL; break;
+               case 'h': flt_flag |= HASH_QNM; break;
+               case 'N': flt_flag &= ~(UPDATE_MD|UPDATE_NM); break;
                case 'b': is_bam_out = 1; break;
                case 'u': is_uncompressed = is_bam_out = 1; break;
                case 'S': is_sam_in = 1; break;
@@ -344,7 +371,7 @@ int bam_fillmd(int argc, char *argv[])
                                int q = bam_cap_mapQ(b, ref, capQ);
                                if (b->core.qual > q) b->core.qual = q;
                        }
-                       if (ref) bam_fillmd1_core(b, ref, is_equal, max_nm);
+                       if (ref) bam_fillmd1_core(b, ref, flt_flag, max_nm);
                }
                samwrite(fpout, b);
        }
index 3e26f74c0e7dac3012ad5805a99d2311e6d274a6..57434e098e2398fc4a6eab32b0d507d7724aa96e 100644 (file)
@@ -78,12 +78,12 @@ static inline int resolve_cigar2(bam_pileup1_t *p, uint32_t pos, cstate_t *s)
        if (s->k == -1) { // never processed
                is_head = 1;
                if (c->n_cigar == 1) { // just one operation, save a loop
-                       if (_cop(cigar[0]) == BAM_CMATCH) s->k = 0, s->x = c->pos, s->y = 0;
+                 if (_cop(cigar[0]) == BAM_CMATCH || _cop(cigar[0]) == BAM_CEQUAL || _cop(cigar[0]) == BAM_CDIFF) s->k = 0, s->x = c->pos, s->y = 0;
                } else { // find the first match or deletion
                        for (k = 0, s->x = c->pos, s->y = 0; k < c->n_cigar; ++k) {
                                int op = _cop(cigar[k]);
                                int l = _cln(cigar[k]);
-                               if (op == BAM_CMATCH || op == BAM_CDEL) break;
+                               if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CEQUAL || op == BAM_CDIFF) break;
                                else if (op == BAM_CREF_SKIP) s->x += l;
                                else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
                        }
@@ -95,16 +95,16 @@ static inline int resolve_cigar2(bam_pileup1_t *p, uint32_t pos, cstate_t *s)
                if (pos - s->x >= l) { // jump to the next operation
                        assert(s->k < c->n_cigar); // otherwise a bug: this function should not be called in this case
                        op = _cop(cigar[s->k+1]);
-                       if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP) { // jump to the next without a loop
-                               if (_cop(cigar[s->k]) == BAM_CMATCH) s->y += l;
+                       if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { // jump to the next without a loop
+                         if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
                                s->x += l;
                                ++s->k;
-                       } else { // find the next M/D/N
-                               if (_cop(cigar[s->k]) == BAM_CMATCH) s->y += l;
+                       } else { // find the next M/D/N/=/X
+                         if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
                                s->x += l;
                                for (k = s->k + 1; k < c->n_cigar; ++k) {
                                        op = _cop(cigar[k]), l = _cln(cigar[k]);
-                                       if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP) break;
+                                       if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) break;
                                        else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
                                }
                                s->k = k;
@@ -126,12 +126,12 @@ static inline int resolve_cigar2(bam_pileup1_t *p, uint32_t pos, cstate_t *s)
                                for (k = s->k + 2; k < c->n_cigar; ++k) {
                                        op2 = _cop(cigar[k]); l2 = _cln(cigar[k]);
                                        if (op2 == BAM_CINS) l3 += l2;
-                                       else if (op2 == BAM_CDEL || op2 == BAM_CMATCH || op2 == BAM_CREF_SKIP) break;
+                                       else if (op2 == BAM_CDEL || op2 == BAM_CMATCH || op2 == BAM_CREF_SKIP || op2 == BAM_CEQUAL || op2 == BAM_CDIFF) break;
                                }
                                if (l3 > 0) p->indel = l3;
                        }
                }
-               if (op == BAM_CMATCH) {
+               if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
                        p->qpos = s->y + (pos - s->x);
                } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
                        p->is_del = 1; p->qpos = s->y; // FIXME: distinguish D and N!!!!!
index aeae6ca0de411f198f4ff76d7c29f99a19f54448..822ae5cf80fd7f9dc375259524935c933f088323 100644 (file)
@@ -28,7 +28,7 @@
 #ifndef BCF_H
 #define BCF_H
 
-#define BCF_VERSION "0.1.17 (r973:277)"
+#define BCF_VERSION "0.1.17-dev (r973:277)"
 
 #include <stdint.h>
 #include <zlib.h>
@@ -154,6 +154,8 @@ extern "C" {
        int bcf_fix_pl(bcf1_t *b);
        // convert PL to GLF-like 10-likelihood GL
        int bcf_gl10(const bcf1_t *b, uint8_t *gl);
+       // convert up to 4 INDEL alleles to GLF-like 10-likelihood GL
+       int bcf_gl10_indel(const bcf1_t *b, uint8_t *gl);
 
        // string hash table
        void *bcf_build_refhash(bcf_hdr_t *h);
index e11cfce2663029d480d03c8bebf7b338a9b3b973..0eab4c1f322b398750fdda9da4caec6eea8e7579 100644 (file)
@@ -5,6 +5,11 @@
 #include "khash.h"
 KHASH_MAP_INIT_STR(str2id, int)
 
+#ifdef _WIN32
+#define srand48(x) srand(x)
+#define drand48() ((double)rand() / RAND_MAX)
+#endif
+
 // FIXME: valgrind report a memory leak in this function. Probably it does not get deallocated...
 void *bcf_build_refhash(bcf_hdr_t *h)
 {
@@ -349,8 +354,6 @@ int bcf_gl10(const bcf1_t *b, uint8_t *gl)
        for (i = 0; i < b->n_smpl; ++i) {
                const uint8_t *p = PL->data + i * PL->len; // the PL for the i-th individual
                uint8_t *g = gl + 10 * i;
-               for (j = 0; j < PL->len; ++j)
-                       if (p[j]) break;
                for (k = j = 0; k < 4; ++k) {
                        for (l = k; l < 4; ++l) {
                                int t, x = map[k], y = map[l];
@@ -361,3 +364,27 @@ int bcf_gl10(const bcf1_t *b, uint8_t *gl)
        }
        return 0;
 }
+
+int bcf_gl10_indel(const bcf1_t *b, uint8_t *gl)
+{
+       int k, l, j, i;
+       const bcf_ginfo_t *PL;
+       if (b->alt[0] == 0) return -1; // no alternate allele
+       for (i = 0; i < b->n_gi; ++i)
+               if (b->gi[i].fmt == bcf_str2int("PL", 2)) break;
+       if (i == b->n_gi) return -1; // no PL
+       PL = b->gi + i;
+       for (i = 0; i < b->n_smpl; ++i) {
+               const uint8_t *p = PL->data + i * PL->len; // the PL for the i-th individual
+               uint8_t *g = gl + 10 * i;
+               for (k = j = 0; k < 4; ++k) {
+                       for (l = k; l < 4; ++l) {
+                               int t, x = k, y = l;
+                               if (x > y) t = x, x = y, y = t; // make sure x is the smaller
+                               x = y * (y+1) / 2 + x;
+                               g[j++] = x < PL->len? p[x] : 255;
+                       }
+               }
+       }
+       return 0;
+}
index b2d7d4a197b112ea10c6caf3ad4a9a13d95b1298..3cc464949eb69e630ee7905c96d4f8050b1b569c 100644 (file)
@@ -8,6 +8,11 @@
 #include "kstring.h"
 #include "time.h"
 
+#ifdef _WIN32
+#define srand48(x) srand(x)
+#define lrand48() rand()
+#endif
+
 #include "kseq.h"
 KSTREAM_INIT(gzFile, gzread, 16384)
 
@@ -126,7 +131,6 @@ static int update_bcf1(bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p1rst_t *pr,
                if (em[5] >= 0 && em[6] >= 0) ksprintf(&s, ";AF2=%.4g,%.4g", 1 - em[5], 1 - em[6]);
                if (em[7] >= 0) ksprintf(&s, ";LRT=%.3g", em[7]);
                if (em[8] >= 0) ksprintf(&s, ";LRT2=%.3g", em[8]);
-               //if (em[9] >= 0) ksprintf(&s, ";LRT1=%.3g", em[9]);
        }
        if (cons_llr > 0) {
                ksprintf(&s, ";CLR=%d", cons_llr);
@@ -269,6 +273,8 @@ static void write_header(bcf_hdr_t *h)
         kputs("##INFO=<ID=QCHI2,Number=1,Type=Integer,Description=\"Phred scaled PCHI2.\">\n", &str);
     if (!strstr(str.s, "##INFO=<ID=RP,"))
         kputs("##INFO=<ID=PR,Number=1,Type=Integer,Description=\"# permutations yielding a smaller PCHI2.\">\n", &str);
+    if (!strstr(str.s, "##INFO=<ID=VDB,"))
+        kputs("##INFO=<ID=VDB,Number=1,Type=Float,Description=\"Variant Distance Bias\">\n", &str);
     if (!strstr(str.s, "##FORMAT=<ID=GT,"))
         kputs("##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n", &str);
     if (!strstr(str.s, "##FORMAT=<ID=GQ,"))
@@ -280,7 +286,7 @@ static void write_header(bcf_hdr_t *h)
        if (!strstr(str.s, "##FORMAT=<ID=SP,"))
                kputs("##FORMAT=<ID=SP,Number=1,Type=Integer,Description=\"Phred-scaled strand bias P-value\">\n", &str);
        if (!strstr(str.s, "##FORMAT=<ID=PL,"))
-               kputs("##FORMAT=<ID=PL,Number=-1,Type=Integer,Description=\"List of Phred-scaled genotype likelihoods, number of values is (#ALT+1)*(#ALT+2)/2\">\n", &str);
+               kputs("##FORMAT=<ID=PL,Number=G,Type=Integer,Description=\"List of Phred-scaled genotype likelihoods\">\n", &str);
        h->l_txt = str.l + 1; h->txt = str.s;
 }
 
@@ -490,7 +496,7 @@ int bcfview(int argc, char *argv[])
                        if (!(l > begin && end > b->pos)) continue;
                }
                ++n_processed;
-               if (vc.flag & VC_QCNT) { // summarize the difference
+               if ((vc.flag & VC_QCNT) && !is_indel) { // summarize the difference
                        int x = bcf_min_diff(b);
                        if (x > 255) x = 255;
                        if (x >= 0) ++qcnt[x];
index 32b400ff70f63ebf71e26bf94ce5bd9af8434859..b7dfe1a2889e08de37deb725dd7569d963bdab91 100644 (file)
@@ -168,7 +168,6 @@ static double lk_ratio_test(int n, int n1, const double *pdg, double f3[3][3])
 // x[5..6]: group1 freq, group2 freq
 // x[7]: 1-degree P-value
 // x[8]: 2-degree P-value
-// x[9]: 1-degree P-value with freq estimated from genotypes
 int bcf_em1(const bcf1_t *b, int n1, int flag, double x[10])
 {
        double *pdg;
@@ -208,11 +207,13 @@ int bcf_em1(const bcf1_t *b, int n1, int flag, double x[10])
                x[6] = freqml(x[0], n1, n, pdg);
        }
        if ((flag & 1<<7) && n1 > 0 && n1 < n) { // 1-degree P-value
-               double f[3], f3[3][3];
+               double f[3], f3[3][3], tmp;
                f[0] = x[0]; f[1] = x[5]; f[2] = x[6];
                for (i = 0; i < 3; ++i)
                        f3[i][0] = (1-f[i])*(1-f[i]), f3[i][1] = 2*f[i]*(1-f[i]), f3[i][2] = f[i]*f[i];
-               x[7] = kf_gammaq(.5, log(lk_ratio_test(n, n1, pdg, f3)));
+               tmp = log(lk_ratio_test(n, n1, pdg, f3));
+               if (tmp < 0) tmp = 0;
+               x[7] = kf_gammaq(.5, tmp);
        }
        if ((flag & 3<<8) && n1 > 0 && n1 < n) { // 2-degree P-value
                double g[3][3], tmp;
@@ -222,8 +223,8 @@ int bcf_em1(const bcf1_t *b, int n1, int flag, double x[10])
                for (i = 0; i < ITER_MAX; ++i)
                        if (g3_iter(g[2], pdg, n1, n) < EPS) break;
                tmp = log(lk_ratio_test(n, n1, pdg, g));
+               if (tmp < 0) tmp = 0;
                x[8] = kf_gammaq(1., tmp);
-               x[9] = kf_gammaq(.5, tmp);
        }
        // free
        free(pdg);
index 4b7cd104a60b7bc3d4fba7086b551ac72155f350..15ef265b25b38ce3b3894eebdd52e6496cb0135b 100644 (file)
@@ -33,6 +33,7 @@ uint32_t *bcf_trio_prep(int is_x, int is_son)
        return ret;
 }
 
+
 int bcf_trio_call(const uint32_t *prep, const bcf1_t *b, int *llr, int64_t *gt)
 {
        int i, j, k;
@@ -44,7 +45,9 @@ int bcf_trio_call(const uint32_t *prep, const bcf1_t *b, int *llr, int64_t *gt)
                if (b->gi[i].fmt == bcf_str2int("PL", 2)) break;
        if (i == b->n_gi) return -1; // no PL
        gl10 = alloca(10 * b->n_smpl);
-       if (bcf_gl10(b, gl10) < 0) return -1;
+       if (bcf_gl10(b, gl10) < 0) {
+               if (bcf_gl10_indel(b, gl10) < 0) return -1;
+       }
        PL = b->gi + i;
        for (i = 0, k = 0; i < 4; ++i)
                for (j = i; j < 4; ++j)
index 34f0f2fb3ba0e40aaab31e4c8eca4140e4a5d458..ec75a1067f5905c69ead868ff42d2073189bb690 100644 (file)
--- a/bedidx.c
+++ b/bedidx.c
@@ -4,6 +4,10 @@
 #include <stdio.h>
 #include <zlib.h>
 
+#ifdef _WIN32
+#define drand48() ((double)rand() / RAND_MAX)
+#endif
+
 #include "ksort.h"
 KSORT_INIT_GENERIC(uint64_t)
 
index 5201c1af75b9c24636ee67481b0461ab7713e7a1..894a2ae625f9e0b2b9d38337b8651d8f02bcce4a 100644 (file)
@@ -161,7 +161,7 @@ int kpa_glocal(const uint8_t *_ref, int l_ref, const uint8_t *_query, int l_quer
                double p = 1., Pr1 = 0.;
                for (i = 0; i <= l_query + 1; ++i) {
                        p *= s[i];
-                       if (p < 1e-100) Pr += -4.343 * log(p), p = 1.;
+                       if (p < 1e-100) Pr1 += -4.343 * log(p), p = 1.;
                }
                Pr1 += -4.343 * log(p * l_ref * l_query);
                Pr = (int)(Pr1 + .499);
diff --git a/kseq.h b/kseq.h
index 82face095919a3991b1f927bb5422ffd95f102a4..0bbc7dc9370e910b19286167bd0862c64855387c 100644 (file)
--- a/kseq.h
+++ b/kseq.h
@@ -1,6 +1,6 @@
 /* The MIT License
 
-   Copyright (c) 2008 Genome Research Ltd (GRL).
+   Copyright (c) 2008, 2009, 2011 Attractive Chaos <attractor@live.co.uk>
 
    Permission is hereby granted, free of charge, to any person obtaining
    a copy of this software and associated documentation files (the
    SOFTWARE.
 */
 
-/* Contact: Heng Li <lh3@sanger.ac.uk> */
-
-/*
-  2009-07-16 (lh3): in kstream_t, change "char*" to "unsigned char*"
- */
-
-/* Last Modified: 12APR2009 */
+/* Last Modified: 18AUG2011 */
 
 #ifndef AC_KSEQ_H
 #define AC_KSEQ_H
@@ -94,10 +88,10 @@ typedef struct __kstring_t {
 #endif
 
 #define __KS_GETUNTIL(__read, __bufsize)                                                               \
-       static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
+       static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
        {                                                                                                                                       \
                if (dret) *dret = 0;                                                                                    \
-               str->l = 0;                                                                                                             \
+               str->l = append? str->l : 0;                                                                    \
                if (ks->begin >= ks->end && ks->is_eof) return -1;                              \
                for (;;) {                                                                                                              \
                        int i;                                                                                                          \
@@ -132,13 +126,15 @@ typedef struct __kstring_t {
                                break;                                                                                                  \
                        }                                                                                                                       \
                }                                                                                                                               \
-               if (str->l == 0) {                                                                                              \
+               if (str->s == 0) {                                                                                              \
                        str->m = 1;                                                                                                     \
                        str->s = (char*)calloc(1, 1);                                                           \
                }                                                                                                                               \
                str->s[str->l] = '\0';                                                                                  \
                return str->l;                                                                                                  \
-       }
+       } \
+       static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
+       { return ks_getuntil2(ks, delimiter, str, dret, 0); }
 
 #define KSTREAM_INIT(type_t, __read, __bufsize) \
        __KS_TYPE(type_t)                                                       \
@@ -171,44 +167,45 @@ typedef struct __kstring_t {
    -1   end-of-file
    -2   truncated quality string
  */
-#define __KSEQ_READ                                                                                                            \
-       static int kseq_read(kseq_t *seq)                                                                       \
-       {                                                                                                                                       \
-               int c;                                                                                                                  \
-               kstream_t *ks = seq->f;                                                                                 \
+#define __KSEQ_READ \
+       static int kseq_read(kseq_t *seq) \
+       { \
+               int c; \
+               kstream_t *ks = seq->f; \
                if (seq->last_char == 0) { /* then jump to the next header line */ \
-                       while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@');        \
-                       if (c == -1) return -1; /* end of file */                                       \
-                       seq->last_char = c;                                                                                     \
-               } /* the first header char has been read */                                             \
-               seq->comment.l = seq->seq.l = seq->qual.l = 0;                                  \
-               if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1;                  \
-               if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0);                 \
+                       while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
+                       if (c == -1) return -1; /* end of file */ \
+                       seq->last_char = c; \
+               } /* else: the first header char has been read in the previous call */ \
+               seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
+               if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \
+               if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); /* read FASTA/Q comment */ \
+               if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
+                       seq->seq.m = 256; \
+                       seq->seq.s = (char*)malloc(seq->seq.m); \
+               } \
                while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
-                       if (isgraph(c)) { /* printable non-space character */           \
-                               if (seq->seq.l + 1 >= seq->seq.m) { /* double the memory */ \
-                                       seq->seq.m = seq->seq.l + 2;                                            \
-                                       kroundup32(seq->seq.m); /* rounded to next closest 2^k */ \
-                                       seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
-                               }                                                                                                               \
-                               seq->seq.s[seq->seq.l++] = (char)c;                                             \
-                       }                                                                                                                       \
-               }                                                                                                                               \
+                       seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
+                       ks_getuntil2(ks, '\n', &seq->seq, 0, 1); /* read the rest of the line */ \
+               } \
                if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
-               seq->seq.s[seq->seq.l] = 0;     /* null terminated string */            \
-               if (c != '+') return seq->seq.l; /* FASTA */                                    \
-               if (seq->qual.m < seq->seq.m) { /* allocate enough memory */    \
-                       seq->qual.m = seq->seq.m;                                                                       \
-                       seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m);         \
-               }                                                                                                                               \
+               if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
+                       seq->seq.m = seq->seq.l + 2; \
+                       kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
+                       seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
+               } \
+               seq->seq.s[seq->seq.l] = 0;     /* null terminated string */ \
+               if (c != '+') return seq->seq.l; /* FASTA */ \
+               if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \
+                       seq->qual.m = seq->seq.m; \
+                       seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
+               } \
                while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
-               if (c == -1) return -2; /* we should not stop here */                   \
-               while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l)             \
-                       if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c; \
-               seq->qual.s[seq->qual.l] = 0; /* null terminated string */              \
+               if (c == -1) return -2; /* error: no quality string */ \
+               while (ks_getuntil2(ks, '\n', &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \
                seq->last_char = 0;     /* we have not come to the next header line */ \
-               if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */ \
-               return seq->seq.l;                                                                                              \
+               if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
+               return seq->seq.l; \
        }
 
 #define __KSEQ_TYPE(type_t)                                            \
@@ -219,7 +216,7 @@ typedef struct __kstring_t {
        } kseq_t;
 
 #define KSEQ_INIT(type_t, __read)                              \
-       KSTREAM_INIT(type_t, __read, 4096)                      \
+       KSTREAM_INIT(type_t, __read, 16384)                     \
        __KSEQ_TYPE(type_t)                                                     \
        __KSEQ_BASIC(type_t)                                            \
        __KSEQ_READ
index 6c25c782eb4e2ad784563991bd69922e885737cb..d2f8bd8ea56c74709f4f0d426787db3cd6c52dd9 100644 (file)
@@ -4,7 +4,7 @@ CFLAGS=         -g -Wall -O2 #-m64 #-arch ppc
 CXXFLAGS=      $(CFLAGS)
 DFLAGS=                -D_FILE_OFFSET_BITS=64
 OBJS=          
-PROG=          md5sum-lite md5fa maq2sam-short maq2sam-long wgsim
+PROG=          md5sum-lite md5fa maq2sam-short maq2sam-long wgsim seqtk
 INCLUDES=      -I..
 SUBDIRS=       .
 
@@ -27,11 +27,11 @@ lib-recur all-recur clean-recur cleanlocal-recur install-recur:
 
 lib:
 
-afs2:afs2.o
-               $(CC) $(CFLAGS) -o $@ afs2.o -lm -lz -L.. -lbam
+seqtk:seqtk.o
+               $(CC) $(CFLAGS) -o $@ seqtk.o -lm -lz
 
 wgsim:wgsim.o
-               $(CC) $(CFLAGS) -o $@ wgsim.o -lm
+               $(CC) $(CFLAGS) -o $@ wgsim.o -lm -lz
 
 md5fa:md5.o md5fa.o md5.h ../kseq.h
                $(CC) $(CFLAGS) -o $@ md5.o md5fa.o -lz
@@ -51,8 +51,11 @@ maq2sam-long:maq2sam.c
 md5fa.o:md5.h md5fa.c
                $(CC) $(CFLAGS) -c -I.. -o $@ md5fa.c
 
-afs2.o:afs2.c ../bam.h
-               $(CC) $(CFLAGS) -c -I.. -o $@ afs2.c
+seqtk.o:seqtk.c ../khash.h ../kseq.h
+               $(CC) $(CFLAGS) -c -I.. -o $@ seqtk.c
+
+wgsim.o:wgsim.c ../kseq.h
+               $(CC) $(CFLAGS) -c -I.. -o $@ wgsim.c
 
 cleanlocal:
                rm -fr gmon.out *.o a.out *.exe *.dSYM $(PROG) *~ *.a
diff --git a/misc/seqtk.c b/misc/seqtk.c
new file mode 100644 (file)
index 0000000..591ddff
--- /dev/null
@@ -0,0 +1,783 @@
+#include <stdio.h>
+#include <ctype.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <zlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <limits.h>
+
+#include "kseq.h"
+KSEQ_INIT(gzFile, gzread)
+
+typedef struct {
+       int n, m;
+       uint64_t *a;
+} reglist_t;
+
+#include "khash.h"
+KHASH_MAP_INIT_STR(reg, reglist_t)
+
+typedef kh_reg_t reghash_t;
+
+reghash_t *stk_reg_read(const char *fn)
+{
+       reghash_t *h = kh_init(reg);
+       gzFile fp;
+       kstream_t *ks;
+       int dret;
+       kstring_t *str;
+       // read the list
+       str = calloc(1, sizeof(kstring_t));
+       fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
+       ks = ks_init(fp);
+       while (ks_getuntil(ks, 0, str, &dret) >= 0) {
+               int beg = -1, end = -1;
+               reglist_t *p;
+               khint_t k = kh_get(reg, h, str->s);
+               if (k == kh_end(h)) {
+                       int ret;
+                       char *s = strdup(str->s);
+                       k = kh_put(reg, h, s, &ret);
+                       memset(&kh_val(h, k), 0, sizeof(reglist_t));
+               }
+               p = &kh_val(h, k);
+               if (dret != '\n') {
+                       if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) {
+                               beg = atoi(str->s);
+                               if (dret != '\n') {
+                                       if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) {
+                                               end = atoi(str->s);
+                                               if (end < 0) end = -1;
+                                       }
+                               }
+                       }
+               }
+               // skip the rest of the line
+               if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n');
+               if (end < 0 && beg > 0) end = beg, beg = beg - 1; // if there is only one column
+               if (beg < 0) beg = 0, end = INT_MAX;
+               if (p->n == p->m) {
+                       p->m = p->m? p->m<<1 : 4;
+                       p->a = realloc(p->a, p->m * 8);
+               }
+               p->a[p->n++] = (uint64_t)beg<<32 | end;
+       }
+       ks_destroy(ks);
+       gzclose(fp);
+       free(str->s); free(str);
+       return h;
+}
+
+void stk_reg_destroy(reghash_t *h)
+{
+       khint_t k;
+       for (k = 0; k < kh_end(h); ++k) {
+               if (kh_exist(h, k)) {
+                       free(kh_val(h, k).a);
+                       free((char*)kh_key(h, k));
+               }
+       }
+       kh_destroy(reg, h);
+}
+
+/* constant table */
+
+unsigned char seq_nt16_table[256] = {
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15 /*'-'*/,15,15,
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+       15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
+       15,15, 5, 6,  8,15, 7, 9,  0,10,15,15, 15,15,15,15,
+       15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
+       15,15, 5, 6,  8,15, 7, 9,  0,10,15,15, 15,15,15,15,
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
+};
+
+char *seq_nt16_rev_table = "XACMGRSVTWYHKDBN";
+unsigned char seq_nt16to4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
+int bitcnt_table[] = { 4, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };
+
+/* composition */
+int stk_comp(int argc, char *argv[])
+{
+       gzFile fp;
+       kseq_t *seq;
+       int l, c, upper_only = 0;
+       reghash_t *h = 0;
+       reglist_t dummy;
+       while ((c = getopt(argc, argv, "ur:")) >= 0) {
+               switch (c) {
+                       case 'u': upper_only = 1; break;
+                       case 'r': h = stk_reg_read(optarg); break;
+               }
+       }
+       if (argc == optind) {
+               fprintf(stderr, "Usage:  seqtk comp [-u] [-r in.bed] <in.fa>\n\n");
+               fprintf(stderr, "Output format: chr, length, #A, #C, #G, #T, #2, #3, #4, #CpG, #tv, #ts, #CpG-ts\n");
+               return 1;
+       }
+       fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r");
+       seq = kseq_init(fp);
+       dummy.n= dummy.m = 1; dummy.a = calloc(1, 8);
+       while ((l = kseq_read(seq)) >= 0) {
+               int i, k;
+               reglist_t *p = 0;
+               if (h) {
+                       khint_t k = kh_get(reg, h, seq->name.s);
+                       if (k != kh_end(h)) p = &kh_val(h, k);
+               } else {
+                       p = &dummy;
+                       dummy.a[0] = l;
+               }
+               for (k = 0; p && k < p->n; ++k) {
+                       int beg = p->a[k]>>32, end = p->a[k]&0xffffffff;
+                       int la, lb, lc, na, nb, nc, cnt[11];
+                       if (beg > 0) la = seq->seq.s[beg-1], lb = seq_nt16_table[la], lc = bitcnt_table[lb];
+                       else la = 'a', lb = -1, lc = 0;
+                       na = seq->seq.s[beg]; nb = seq_nt16_table[na]; nc = bitcnt_table[nb];
+                       memset(cnt, 0, 11 * sizeof(int));
+                       for (i = beg; i < end; ++i) {
+                               int is_CpG = 0, a, b, c;
+                               a = na; b = nb; c = nc;
+                               na = seq->seq.s[i+1]; nb = seq_nt16_table[na]; nc = bitcnt_table[nb];
+                               if (b == 2 || b == 10) { // C or Y
+                                       if (nb == 4 || nb == 5) is_CpG = 1;
+                               } else if (b == 4 || b == 5) { // G or R
+                                       if (lb == 2 || lb == 10) is_CpG = 1;
+                               }
+                               if (upper_only == 0 || isupper(a)) {
+                                       if (c > 1) ++cnt[c+2];
+                                       if (c == 1) ++cnt[seq_nt16to4_table[b]];
+                                       if (b == 10 || b == 5) ++cnt[9];
+                                       else if (c == 2) {
+                                               ++cnt[8];
+                                       }
+                                       if (is_CpG) {
+                                               ++cnt[7];
+                                               if (b == 10 || b == 5) ++cnt[10];
+                                       }
+                               }
+                               la = a; lb = b; lc = c;
+                       }
+                       if (h) printf("%s\t%d\t%d", seq->name.s, beg, end);
+                       else printf("%s\t%d", seq->name.s, l);
+                       for (i = 0; i < 11; ++i) printf("\t%d", cnt[i]);
+                       putchar('\n');
+               }
+               fflush(stdout);
+       }
+       free(dummy.a);
+       kseq_destroy(seq);
+       gzclose(fp);
+       return 0;
+}
+
+int stk_randbase(int argc, char *argv[])
+{
+       gzFile fp;
+       kseq_t *seq;
+       int l;
+       if (argc == 1) {
+               fprintf(stderr, "Usage: seqtk randbase <in.fa>\n");
+               return 1;
+       }
+       fp = (strcmp(argv[1], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[1], "r");
+       seq = kseq_init(fp);
+       while ((l = kseq_read(seq)) >= 0) {
+               int i;
+               printf(">%s", seq->name.s);
+               for (i = 0; i < l; ++i) {
+                       int c, b, a, j, k, m;
+                       b = seq->seq.s[i];
+                       c = seq_nt16_table[b];
+                       a = bitcnt_table[c];
+                       if (a == 2) {
+                               m = (drand48() < 0.5);
+                               for (j = k = 0; j < 4; ++j) {
+                                       if ((1<<j & c) == 0) continue;
+                                       if (k == m) break;
+                                       ++k;
+                               }
+                               seq->seq.s[i] = islower(b)? "acgt"[j] : "ACGT"[j];
+                       }
+                       if (i%60 == 0) putchar('\n');
+                       putchar(seq->seq.s[i]);
+               }
+               putchar('\n');
+       }
+       kseq_destroy(seq);
+       gzclose(fp);
+       return 0;
+}
+
+int stk_hety(int argc, char *argv[])
+{
+       gzFile fp;
+       kseq_t *seq;
+       int l, c, win_size = 50000, n_start = 5, win_step, is_lower_mask = 0;
+       char *buf;
+       uint32_t cnt[3];
+       if (argc == 1) {
+               fprintf(stderr, "\n");
+               fprintf(stderr, "Usage:   seqtk hety [options] <in.fa>\n\n");
+               fprintf(stderr, "Options: -w INT   window size [%d]\n", win_size);
+               fprintf(stderr, "         -t INT   # start positions in a window [%d]\n", n_start);
+               fprintf(stderr, "         -m       treat lowercases as masked\n");
+               fprintf(stderr, "\n");
+               return 1;
+       }
+       while ((c = getopt(argc, argv, "w:t:m")) >= 0) {
+               switch (c) {
+               case 'w': win_size = atoi(optarg); break;
+               case 't': n_start = atoi(optarg); break;
+               case 'm': is_lower_mask = 1; break;
+               }
+       }
+       fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r");
+       seq = kseq_init(fp);
+       win_step = win_size / n_start;
+       buf = calloc(win_size, 1);
+       while ((l = kseq_read(seq)) >= 0) {
+               int x, i, y, z, next = 0;
+               cnt[0] = cnt[1] = cnt[2] = 0;
+               for (i = 0; i <= l; ++i) {
+                       if ((i >= win_size && i % win_step == 0) || i == l) {
+                               if (i == l && l >= win_size) {
+                                       for (y = l - win_size; y < next; ++y) --cnt[(int)buf[y % win_size]];
+                               }
+                               if (cnt[1] + cnt[2] > 0)
+                                       printf("%s\t%d\t%d\t%.2lf\t%d\t%d\n", seq->name.s, next, i,
+                                                  (double)cnt[2] / (cnt[1] + cnt[2]) * win_size, cnt[1] + cnt[2], cnt[2]);
+                               next = i;
+                       }
+                       if (i < l) {
+                               y = i % win_size;
+                               c = seq->seq.s[i];
+                               if (is_lower_mask && islower(c)) c = 'N';
+                               c = seq_nt16_table[c];
+                               x = bitcnt_table[c];
+                               if (i >= win_size) --cnt[(int)buf[y]];
+                               buf[y] = z = x > 2? 0 : x == 2? 2 : 1;
+                               ++cnt[z];
+                       }
+               }
+       }
+       free(buf);
+       kseq_destroy(seq);
+       gzclose(fp);
+       return 0;
+}
+
+/* fq2fa */
+int stk_fq2fa(int argc, char *argv[])
+{
+       gzFile fp;
+       kseq_t *seq;
+       char *buf;
+       int l, i, c, qual_thres = 0, linelen = 60;
+       while ((c = getopt(argc, argv, "q:l:")) >= 0) {
+               switch (c) {
+                       case 'q': qual_thres = atoi(optarg); break;
+                       case 'l': linelen = atoi(optarg); break;
+               }
+       }
+       if (argc == optind) {
+               fprintf(stderr, "Usage: seqtk fq2fa [-q qualThres=0] [-l lineLen=60] <in.fq>\n");
+               return 1;
+       }
+       buf = linelen > 0? malloc(linelen + 1) : 0;
+       fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r");
+       seq = kseq_init(fp);
+       while ((l = kseq_read(seq)) >= 0) {
+               if (seq->qual.l && qual_thres > 0) {
+                       for (i = 0; i < l; ++i)
+                               if (seq->qual.s[i] - 33 < qual_thres)
+                                       seq->seq.s[i] = tolower(seq->seq.s[i]);
+               }
+               putchar('>');
+               if (seq->comment.l) {
+                       fputs(seq->name.s, stdout);
+                       putchar(' ');
+                       puts(seq->comment.s);
+               } else puts(seq->name.s);
+               if (buf) { // multi-line
+                       for (i = 0; i < l; i += linelen) {
+                               int x = i + linelen < l? linelen : l - i;
+                               memcpy(buf, seq->seq.s + i, x);
+                               buf[x] = 0;
+                               puts(buf);
+                       }
+               } else puts(seq->seq.s);
+       }
+       free(buf);
+       kseq_destroy(seq);
+       gzclose(fp);
+       return 0;
+}
+
+int stk_maskseq(int argc, char *argv[])
+{
+       khash_t(reg) *h = kh_init(reg);
+       gzFile fp;
+       kseq_t *seq;
+       int l, i, j, c, is_complement = 0, is_lower = 0;
+       khint_t k;
+       while ((c = getopt(argc, argv, "cl")) >= 0) {
+               switch (c) {
+               case 'c': is_complement = 1; break;
+               case 'l': is_lower = 1; break;
+               }
+       }
+       if (argc - optind < 2) {
+               fprintf(stderr, "Usage:   seqtk maskseq [-cl] <in.fa> <in.bed>\n\n");
+               fprintf(stderr, "Options: -c     mask the complement regions\n");
+               fprintf(stderr, "         -l     soft mask (to lower cases)\n");
+               return 1;
+       }
+       h = stk_reg_read(argv[optind+1]);
+       // maskseq
+       fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r");
+       seq = kseq_init(fp);
+       while ((l = kseq_read(seq)) >= 0) {
+               k = kh_get(reg, h, seq->name.s);
+               if (k == kh_end(h)) { // not found in the hash table
+                       if (is_complement) {
+                               for (j = 0; j < l; ++j)
+                                       seq->seq.s[j] = is_lower? tolower(seq->seq.s[j]) : 'N';
+                       }
+               } else {
+                       reglist_t *p = &kh_val(h, k);
+                       if (!is_complement) {
+                               for (i = 0; i < p->n; ++i) {
+                                       int beg = p->a[i]>>32, end = p->a[i];
+                                       if (beg >= seq->seq.l) {
+                                               fprintf(stderr, "[maskseq] start position >= the sequence length.\n");
+                                               continue;
+                                       }
+                                       if (end >= seq->seq.l) end = seq->seq.l;
+                                       if (is_lower) for (j = beg; j < end; ++j) seq->seq.s[j] = tolower(seq->seq.s[j]);
+                                       else for (j = beg; j < end; ++j) seq->seq.s[j] = 'N';
+                               }
+                       } else {
+                               int8_t *mask = calloc(seq->seq.l, 1);
+                               for (i = 0; i < p->n; ++i) {
+                                       int beg = p->a[i]>>32, end = p->a[i];
+                                       if (end >= seq->seq.l) end = seq->seq.l;
+                                       for (j = beg; j < end; ++j) mask[j] = 1;
+                               }
+                               for (j = 0; j < l; ++j)
+                                       if (mask[j] == 0) seq->seq.s[j] = is_lower? tolower(seq->seq.s[j]) : 'N';
+                               free(mask);
+                       }
+               }
+               printf(">%s", seq->name.s);
+               for (j = 0; j < seq->seq.l; ++j) {
+                       if (j%60 == 0) putchar('\n');
+                       putchar(seq->seq.s[j]);
+               }
+               putchar('\n');
+       }
+       // free
+       kseq_destroy(seq);
+       gzclose(fp);
+       stk_reg_destroy(h);
+       return 0;
+}
+
+/* subseq */
+
+int stk_subseq(int argc, char *argv[])
+{
+       khash_t(reg) *h = kh_init(reg);
+       gzFile fp;
+       kseq_t *seq;
+       int l, i, j, c, is_tab = 0;
+       khint_t k;
+       while ((c = getopt(argc, argv, "t")) >= 0) {
+               switch (c) {
+               case 't': is_tab = 1; break;
+               }
+       }
+       if (optind + 2 > argc) {
+               fprintf(stderr, "Usage: seqtk subseq [-t] <in.fa> <in.bed>\n\n");
+               fprintf(stderr, "Note: Use 'samtools faidx' if only a few regions are intended.\n");
+               return 1;
+       }
+       h = stk_reg_read(argv[optind+1]);
+       // subseq
+       fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r");
+       seq = kseq_init(fp);
+       while ((l = kseq_read(seq)) >= 0) {
+               reglist_t *p;
+               k = kh_get(reg, h, seq->name.s);
+               if (k == kh_end(h)) continue;
+               p = &kh_val(h, k);
+               for (i = 0; i < p->n; ++i) {
+                       int beg = p->a[i]>>32, end = p->a[i];
+                       if (beg >= seq->seq.l) {
+                               fprintf(stderr, "[subseq] %s: %d >= %ld\n", seq->name.s, beg, seq->seq.l);
+                               continue;
+                       }
+                       if (end > seq->seq.l) end = seq->seq.l;
+                       if (is_tab == 0) {
+                               printf("%c%s", seq->qual.l == seq->seq.l? '@' : '>', seq->name.s);
+                               if (end == INT_MAX) {
+                                       if (beg) printf(":%d", beg+1);
+                               } else printf(":%d-%d", beg+1, end);
+                       } else printf("%s\t%d\t", seq->name.s, beg + 1);
+                       if (end > seq->seq.l) end = seq->seq.l;
+                       for (j = 0; j < end - beg; ++j) {
+                               if (is_tab == 0 && j % 60 == 0) putchar('\n');
+                               putchar(seq->seq.s[j + beg]);
+                       }
+                       putchar('\n');
+                       if (seq->qual.l != seq->seq.l || is_tab) continue;
+                       printf("+");
+                       for (j = 0; j < end - beg; ++j) {
+                               if (j % 60 == 0) putchar('\n');
+                               putchar(seq->qual.s[j + beg]);
+                       }
+                       putchar('\n');
+               }
+       }
+       // free
+       kseq_destroy(seq);
+       gzclose(fp);
+       stk_reg_destroy(h);
+       return 0;
+}
+
+/* mergefa */
+int stk_mergefa(int argc, char *argv[])
+{
+       gzFile fp[2];
+       kseq_t *seq[2];
+       int i, l, c, is_intersect = 0, is_haploid = 0, qual = 0, is_mask = 0;
+       while ((c = getopt(argc, argv, "himq:")) >= 0) {
+               switch (c) {
+                       case 'i': is_intersect = 1; break;
+                       case 'h': is_haploid = 1; break;
+                       case 'm': is_mask = 1; break;
+                       case 'q': qual = atoi(optarg); break;
+               }
+       }
+       if (is_mask && is_intersect) {
+               fprintf(stderr, "[%s] `-i' and `-h' cannot be applied at the same time.\n", __func__);
+               return 1;
+       }
+       if (optind + 2 > argc) {
+               fprintf(stderr, "\nUsage: seqtk mergefa [options] <in1.fa> <in2.fa>\n\n");
+               fprintf(stderr, "Options: -q INT   quality threshold [0]\n");
+               fprintf(stderr, "         -i       take intersection\n");
+               fprintf(stderr, "         -m       convert to lowercase when one of the input base is N.\n");
+               fprintf(stderr, "         -h       suppress hets in the input\n\n");
+               return 1;
+       }
+       for (i = 0; i < 2; ++i) {
+               fp[i] = strcmp(argv[optind+i], "-")? gzopen(argv[optind+i], "r") : gzdopen(fileno(stdin), "r");
+               seq[i] = kseq_init(fp[i]);
+       }
+       while (kseq_read(seq[0]) >= 0) {
+               int min_l, c[2], is_upper;
+               kseq_read(seq[1]);
+               if (strcmp(seq[0]->name.s, seq[1]->name.s))
+                       fprintf(stderr, "[%s] Different sequence names: %s != %s\n", __func__, seq[0]->name.s, seq[1]->name.s);
+               if (seq[0]->seq.l != seq[1]->seq.l)
+                       fprintf(stderr, "[%s] Unequal sequence length: %ld != %ld\n", __func__, seq[0]->seq.l, seq[1]->seq.l);
+               min_l = seq[0]->seq.l < seq[1]->seq.l? seq[0]->seq.l : seq[1]->seq.l;
+               printf(">%s", seq[0]->name.s);
+               for (l = 0; l < min_l; ++l) {
+                       c[0] = seq[0]->seq.s[l]; c[1] = seq[1]->seq.s[l];
+                       if (seq[0]->qual.l && seq[0]->qual.s[l] - 33 < qual) c[0] = tolower(c[0]);
+                       if (seq[1]->qual.l && seq[1]->qual.s[l] - 33 < qual) c[1] = tolower(c[1]);
+                       if (is_intersect) is_upper = (isupper(c[0]) || isupper(c[1]))? 1 : 0;
+                       else if (is_mask) is_upper = (isupper(c[0]) || isupper(c[1]))? 1 : 0;
+                       else is_upper = (isupper(c[0]) && isupper(c[1]))? 1 : 0;
+                       c[0] = seq_nt16_table[c[0]]; c[1] = seq_nt16_table[c[1]];
+                       if (c[0] == 0) c[0] = 15;
+                       if (c[1] == 0) c[1] = 15;
+                       if (is_haploid && (bitcnt_table[c[0]] > 1 || bitcnt_table[c[1]] > 1)) is_upper = 0;
+                       if (is_intersect) {
+                               c[0] = c[0] & c[1];
+                               if (c[0] == 0) is_upper = 0;
+                       } else if (is_mask) {
+                               if (c[0] == 15 || c[1] == 15) is_upper = 0;
+                               c[0] = c[0] & c[1];
+                               if (c[0] == 0) is_upper = 0;
+                       } else c[0] = c[0] | c[1];
+                       c[0] = seq_nt16_rev_table[c[0]];
+                       if (!is_upper) c[0] = tolower(c[0]);
+                       if (l%60 == 0) putchar('\n');
+                       putchar(c[0]);
+               }
+               putchar('\n');
+       }
+       return 0;
+}
+
+int stk_famask(int argc, char *argv[])
+{
+       gzFile fp[2];
+       kseq_t *seq[2];
+       int i, l;
+       if (argc < 3) {
+               fprintf(stderr, "Usage: seqtk famask <src.fa> <mask.fa>\n");
+               return 1;
+       }
+       for (i = 0; i < 2; ++i) {
+               fp[i] = strcmp(argv[optind+i], "-")? gzopen(argv[optind+i], "r") : gzdopen(fileno(stdin), "r");
+               seq[i] = kseq_init(fp[i]);
+       }
+       while (kseq_read(seq[0]) >= 0) {
+               int min_l, c[2];
+               kseq_read(seq[1]);
+               if (strcmp(seq[0]->name.s, seq[1]->name.s))
+                       fprintf(stderr, "[%s] Different sequence names: %s != %s\n", __func__, seq[0]->name.s, seq[1]->name.s);
+               if (seq[0]->seq.l != seq[1]->seq.l)
+                       fprintf(stderr, "[%s] Unequal sequence length: %ld != %ld\n", __func__, seq[0]->seq.l, seq[1]->seq.l);
+               min_l = seq[0]->seq.l < seq[1]->seq.l? seq[0]->seq.l : seq[1]->seq.l;
+               printf(">%s", seq[0]->name.s);
+               for (l = 0; l < min_l; ++l) {
+                       c[0] = seq[0]->seq.s[l]; c[1] = seq[1]->seq.s[l];
+                       if (c[1] == 'x') c[0] = tolower(c[0]);
+                       else if (c[1] != 'X') c[0] = c[1];
+                       if (l%60 == 0) putchar('\n');
+                       putchar(c[0]);
+               }
+               putchar('\n');
+       }
+       return 0;
+}
+
+int stk_mutfa(int argc, char *argv[])
+{
+       khash_t(reg) *h = kh_init(reg);
+       gzFile fp;
+       kseq_t *seq;
+       kstream_t *ks;
+       int l, i, dret;
+       kstring_t *str;
+       khint_t k;
+       if (argc < 3) {
+               fprintf(stderr, "Usage: seqtk mutfa <in.fa> <in.snp>\n\n");
+               fprintf(stderr, "Note: <in.snp> contains at least four columns per line which are:\n");
+               fprintf(stderr, "      'chr  1-based-pos  any  base-changed-to'.\n");
+               return 1;
+       }
+       // read the list
+       str = calloc(1, sizeof(kstring_t));
+       fp = strcmp(argv[2], "-")? gzopen(argv[2], "r") : gzdopen(fileno(stdin), "r");
+       ks = ks_init(fp);
+       while (ks_getuntil(ks, 0, str, &dret) >= 0) {
+               char *s = strdup(str->s);
+               int beg = 0, ret;
+               reglist_t *p;
+               k = kh_get(reg, h, s);
+               if (k == kh_end(h)) {
+                       k = kh_put(reg, h, s, &ret);
+                       memset(&kh_val(h, k), 0, sizeof(reglist_t));
+               }
+               p = &kh_val(h, k);
+               if (ks_getuntil(ks, 0, str, &dret) > 0) beg = atol(str->s) - 1; // 2nd col
+               ks_getuntil(ks, 0, str, &dret); // 3rd col
+               ks_getuntil(ks, 0, str, &dret); // 4th col
+               // skip the rest of the line
+               if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n');
+               if (isalpha(str->s[0]) && str->l == 1) {
+                       if (p->n == p->m) {
+                               p->m = p->m? p->m<<1 : 4;
+                               p->a = realloc(p->a, p->m * 8);
+                       }
+                       p->a[p->n++] = (uint64_t)beg<<32 | str->s[0];
+               }
+       }
+       ks_destroy(ks);
+       gzclose(fp);
+       free(str->s); free(str);
+       // mutfa
+       fp = strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r");
+       seq = kseq_init(fp);
+       while ((l = kseq_read(seq)) >= 0) {
+               reglist_t *p;
+               k = kh_get(reg, h, seq->name.s);
+               if (k != kh_end(h)) {
+                       p = &kh_val(h, k);
+                       for (i = 0; i < p->n; ++i) {
+                               int beg = p->a[i]>>32;
+                               if (beg < seq->seq.l)
+                                       seq->seq.s[beg] = (int)p->a[i];
+                       }
+               }
+               printf(">%s", seq->name.s);
+               for (i = 0; i < l; ++i) {
+                       if (i%60 == 0) putchar('\n');
+                       putchar(seq->seq.s[i]);
+               }
+               putchar('\n');
+       }
+       // free
+       kseq_destroy(seq);
+       gzclose(fp);
+       for (k = 0; k < kh_end(h); ++k) {
+               if (kh_exist(h, k)) {
+                       free(kh_val(h, k).a);
+                       free((char*)kh_key(h, k));
+               }
+       }
+       kh_destroy(reg, h);
+       return 0;
+}
+
+int stk_listhet(int argc, char *argv[])
+{
+       gzFile fp;
+       kseq_t *seq;
+       int i, l;
+       if (argc == 1) {
+               fprintf(stderr, "Usage: seqtk listhet <in.fa>\n");
+               return 1;
+       }
+       fp = (strcmp(argv[1], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[1], "r");
+       seq = kseq_init(fp);
+       while ((l = kseq_read(seq)) >= 0) {
+               for (i = 0; i < l; ++i) {
+                       int b = seq->seq.s[i];
+                       if (bitcnt_table[seq_nt16_table[b]] == 2)
+                               printf("%s\t%d\t%c\n", seq->name.s, i+1, b);
+               }
+       }
+       kseq_destroy(seq);
+       gzclose(fp);
+       return 0;
+}
+
+/* cutN */
+static int cutN_min_N_tract = 1000;
+static int cutN_nonN_penalty = 10;
+
+static int find_next_cut(const kseq_t *ks, int k, int *begin, int *end)
+{
+       int i, b, e;
+       while (k < ks->seq.l) {
+               if (seq_nt16_table[(int)ks->seq.s[k]] == 15) {
+                       int score, max;
+                       score = 0; e = max = -1;
+                       for (i = k; i < ks->seq.l && score >= 0; ++i) { /* forward */
+                               if (seq_nt16_table[(int)ks->seq.s[i]] == 15) ++score;
+                               else score -= cutN_nonN_penalty;
+                               if (score > max) max = score, e = i;
+                       }
+                       score = 0; b = max = -1;
+                       for (i = e; i >= 0 && score >= 0; --i) { /* backward */
+                               if (seq_nt16_table[(int)ks->seq.s[i]] == 15) ++score;
+                               else score -= cutN_nonN_penalty;
+                               if (score > max) max = score, b = i;
+                       }
+                       if (e + 1 - b >= cutN_min_N_tract) {
+                               *begin = b;
+                               *end = e + 1;
+                               return *end;
+                       }
+                       k = e + 1;
+               } else ++k;
+       }
+       return -1;
+}
+static void print_seq(FILE *fpout, const kseq_t *ks, int begin, int end)
+{
+       int i;
+       if (begin >= end) return; // FIXME: why may this happen? Understand it!
+       fprintf(fpout, ">%s:%d-%d", ks->name.s, begin+1, end);
+       for (i = begin; i < end && i < ks->seq.l; ++i) {
+               if ((i - begin)%60 == 0) fputc('\n', fpout);
+               fputc(ks->seq.s[i], fpout);
+       }
+       fputc('\n', fpout);
+}
+int stk_cutN(int argc, char *argv[])
+{
+       int c, l, gap_only = 0;
+       gzFile fp;
+       kseq_t *ks;
+       while ((c = getopt(argc, argv, "n:p:g")) >= 0) {
+               switch (c) {
+               case 'n': cutN_min_N_tract = atoi(optarg); break;
+               case 'p': cutN_nonN_penalty = atoi(optarg); break;
+               case 'g': gap_only = 1; break;
+               default: return 1;
+               }
+       }
+       if (argc == optind) {
+               fprintf(stderr, "\n");
+               fprintf(stderr, "Usage:   seqtk cutN [options] <in.fa>\n\n");
+               fprintf(stderr, "Options: -n INT    min size of N tract [%d]\n", cutN_min_N_tract);
+               fprintf(stderr, "         -p INT    penalty for a non-N [%d]\n", cutN_nonN_penalty);
+               fprintf(stderr, "         -g        print gaps only, no sequence\n\n");
+               return 1;
+       }
+       fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r");
+       ks = kseq_init(fp);
+       while ((l = kseq_read(ks)) >= 0) {
+               int k = 0, begin = 0, end = 0;
+               while (find_next_cut(ks, k, &begin, &end) >= 0) {
+                       if (begin != 0) {
+                               if (gap_only) printf("%s\t%d\t%d\n", ks->name.s, begin, end);
+                               else print_seq(stdout, ks, k, begin);
+                       }
+                       k = end;
+               }
+               if (!gap_only) print_seq(stdout, ks, k, l);
+       }
+       kseq_destroy(ks);
+       gzclose(fp);
+       return 0;
+}
+
+/* main function */
+static int usage()
+{
+       fprintf(stderr, "\n");
+       fprintf(stderr, "Usage:   seqtk <command> <arguments>\n\n");
+       fprintf(stderr, "Command: comp      get the nucleotide composite of FASTA/Q\n");
+       fprintf(stderr, "         hety      regional heterozygosity\n");
+       fprintf(stderr, "         fq2fa     convert FASTQ to FASTA\n");
+       fprintf(stderr, "         subseq    extract subsequences from FASTA/Q\n");
+       fprintf(stderr, "         maskseq   mask sequences\n");
+       fprintf(stderr, "         mutfa     point mutate FASTA at specified positions\n");
+       fprintf(stderr, "         mergefa   merge two FASTA/Q files\n");
+       fprintf(stderr, "         randbase  choose a random base from hets\n");
+       fprintf(stderr, "         cutN      cut sequence at long N\n");
+       fprintf(stderr, "         listhet   extract the position of each het\n");
+       fprintf(stderr, "\n");
+       return 1;
+}
+
+int main(int argc, char *argv[])
+{
+       if (argc == 1) return usage();
+       if (strcmp(argv[1], "comp") == 0) stk_comp(argc-1, argv+1);
+       else if (strcmp(argv[1], "hety") == 0) stk_hety(argc-1, argv+1);
+       else if (strcmp(argv[1], "fq2fa") == 0) stk_fq2fa(argc-1, argv+1);
+       else if (strcmp(argv[1], "subseq") == 0) stk_subseq(argc-1, argv+1);
+       else if (strcmp(argv[1], "maskseq") == 0) stk_maskseq(argc-1, argv+1);
+       else if (strcmp(argv[1], "mutfa") == 0) stk_mutfa(argc-1, argv+1);
+       else if (strcmp(argv[1], "mergefa") == 0) stk_mergefa(argc-1, argv+1);
+       else if (strcmp(argv[1], "randbase") == 0) stk_randbase(argc-1, argv+1);
+       else if (strcmp(argv[1], "cutN") == 0) stk_cutN(argc-1, argv+1);
+       else if (strcmp(argv[1], "listhet") == 0) stk_listhet(argc-1, argv+1);
+       else if (strcmp(argv[1], "famask") == 0) stk_famask(argc-1, argv+1);
+       else {
+               fprintf(stderr, "[main] unrecognized commad '%s'. Abort!\n", argv[1]);
+               return 1;
+       }
+       return 0;
+}
index 7b5f095d910b06f3242419066ac8879b7cdb7454..b9c513c0540bc5120c0e1294ed62f91890bad000 100644 (file)
@@ -1,6 +1,7 @@
 /* The MIT License
 
    Copyright (c) 2008 Genome Research Ltd (GRL).
+                 2011 Heng Li <lh3@live.co.uk>
 
    Permission is hereby granted, free of charge, to any person obtaining
    a copy of this software and associated documentation files (the
    SOFTWARE.
 */
 
-/* Contact: Heng Li <lh3@sanger.ac.uk> */
-
 /* This program is separated from maq's read simulator with Colin
- * Hercus' modification to allow longer indels. Colin is the chief
- * developer of novoalign. */
+ * Hercus' modification to allow longer indels. */
 
 #include <stdlib.h>
 #include <math.h>
 #include <stdint.h>
 #include <ctype.h>
 #include <string.h>
+#include <zlib.h>
+#include "kseq.h"
+KSEQ_INIT(gzFile, gzread)
 
-#define PACKAGE_VERSION "0.2.3"
+#define PACKAGE_VERSION "0.3.0"
 
 const uint8_t nst_nt4_table[256] = {
        4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
@@ -60,8 +61,6 @@ const uint8_t nst_nt4_table[256] = {
        4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4
 };
 
-const int nst_color_space_table[] = { 4, 0, 0, 1, 0, 2, 3, 4, 0, 3, 2, 4, 1, 4, 4, 4};
-
 /* Simple normal random number generator, copied from genran.c */
 
 double ran_normal()
@@ -85,78 +84,6 @@ double ran_normal()
        }
 }
 
-/* FASTA parser, copied from seq.c */
-
-typedef struct {
-       int l, m; /* length and maximum buffer size */
-       unsigned char *s; /* sequence */
-} seq_t;
-
-#define INIT_SEQ(seq) (seq).s = 0; (seq).l = (seq).m = 0
-
-static int SEQ_BLOCK_SIZE = 512;
-
-void seq_set_block_size(int size)
-{
-       SEQ_BLOCK_SIZE = size;
-}
-
-int seq_read_fasta(FILE *fp, seq_t *seq, char *locus, char *comment)
-{
-       int c, l, max;
-       char *p;
-       
-       c = 0;
-       while (!feof(fp) && fgetc(fp) != '>');
-       if (feof(fp)) return -1;
-       p = locus;
-       while (!feof(fp) && (c = fgetc(fp)) != ' ' && c != '\t' && c != '\n')
-               if (c != '\r') *p++ = c;
-       *p = '\0';
-       if (comment) {
-               p = comment;
-               if (c != '\n') {
-                       while (!feof(fp) && ((c = fgetc(fp)) == ' ' || c == '\t'));
-                       if (c != '\n') {
-                               *p++ = c;
-                               while (!feof(fp) && (c = fgetc(fp)) != '\n')
-                                       if (c != '\r') *p++ = c;
-                       }
-               }
-               *p = '\0';
-       } else if (c != '\n') while (!feof(fp) && fgetc(fp) != '\n');
-       l = 0; max = seq->m;
-       while (!feof(fp) && (c = fgetc(fp)) != '>') {
-               if (isalpha(c) || c == '-' || c == '.') {
-                       if (l + 1 >= max) {
-                               max += SEQ_BLOCK_SIZE;
-                               seq->s = (unsigned char*)realloc(seq->s, sizeof(char) * max);
-                       }
-                       seq->s[l++] = (unsigned char)c;
-               }
-       }
-       if (c == '>') ungetc(c,fp);
-       seq->s[l] = 0;
-       seq->m = max; seq->l = l;
-       return l;
-}
-
-/* Error-checking open, copied from utils.c */
-
-#define xopen(fn, mode) err_xopen_core(__func__, fn, mode)
-
-FILE *err_xopen_core(const char *func, const char *fn, const char *mode)
-{
-       FILE *fp = 0;
-       if (strcmp(fn, "-") == 0)
-               return (strstr(mode, "r"))? stdin : stdout;
-       if ((fp = fopen(fn, mode)) == 0) {
-               fprintf(stderr, "[%s] fail to open file '%s'. Abort!\n", func, fn);
-               abort();
-       }
-       return fp;
-}
-
 /* wgsim */
 
 enum muttype_t {NOCHANGE = 0, INSERT = 0x1000, SUBSTITUTE = 0xe000, DELETE = 0xf000};
@@ -170,24 +97,23 @@ typedef struct {
 
 static double ERR_RATE = 0.02;
 static double MUT_RATE = 0.001;
-static double INDEL_FRAC = 0.1;
+static double INDEL_FRAC = 0.15;
 static double INDEL_EXTEND = 0.3;
-static int IS_SOLID = 0;
-static int SHOW_MM_INFO = 1;
+static double MAX_N_RATIO = 0.1;
 
-void maq_mut_diref(const seq_t *seq, int is_hap, mutseq_t *hap1, mutseq_t *hap2)
+void wgsim_mut_diref(const kseq_t *ks, int is_hap, mutseq_t *hap1, mutseq_t *hap2)
 {
        int i, deleting = 0;
        mutseq_t *ret[2];
 
        ret[0] = hap1; ret[1] = hap2;
-       ret[0]->l = seq->l; ret[1]->l = seq->l;
-       ret[0]->m = seq->m; ret[1]->m = seq->m;
-       ret[0]->s = (mut_t *)calloc(seq->m, sizeof(mut_t));
-       ret[1]->s = (mut_t *)calloc(seq->m, sizeof(mut_t));
-       for (i = 0; i != seq->l; ++i) {
+       ret[0]->l = ks->seq.l; ret[1]->l = ks->seq.l;
+       ret[0]->m = ks->seq.m; ret[1]->m = ks->seq.m;
+       ret[0]->s = (mut_t *)calloc(ks->seq.m, sizeof(mut_t));
+       ret[1]->s = (mut_t *)calloc(ks->seq.m, sizeof(mut_t));
+       for (i = 0; i != ks->seq.l; ++i) {
                int c;
-               c = ret[0]->s[i] = ret[1]->s[i] = (mut_t)nst_nt4_table[(int)seq->s[i]];
+               c = ret[0]->s[i] = ret[1]->s[i] = (mut_t)nst_nt4_table[(int)ks->seq.s[i]];
         if (deleting) {
             if (drand48() < INDEL_EXTEND) {
                 if (deleting & 1) ret[0]->s[i] |= DELETE;
@@ -230,12 +156,12 @@ void maq_mut_diref(const seq_t *seq, int is_hap, mutseq_t *hap1, mutseq_t *hap2)
                }
        }
 }
-void maq_print_mutref(const char *name, const seq_t *seq, mutseq_t *hap1, mutseq_t *hap2)
+void wgsim_print_mutref(const char *name, const kseq_t *ks, mutseq_t *hap1, mutseq_t *hap2)
 {
        int i;
-       for (i = 0; i != seq->l; ++i) {
+       for (i = 0; i != ks->seq.l; ++i) {
                int c[3];
-               c[0] = nst_nt4_table[(int)seq->s[i]];
+               c[0] = nst_nt4_table[(int)ks->seq.s[i]];
                c[1] = hap1->s[i]; c[2] = hap2->s[i];
                if (c[0] >= 4) continue;
                if ((c[1] & mutmsk) != NOCHANGE || (c[2] & mutmsk) != NOCHANGE) {
@@ -248,8 +174,9 @@ void maq_print_mutref(const char *name, const seq_t *seq, mutseq_t *hap1, mutseq
                                } else if (((c[1] & mutmsk) >> 12) <= 5) { // ins
                                        printf("-\t");
                     int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4;
-                    while(n > 0) {
+                    while (n > 0) {
                         putchar("ACGTN"[ins & 0x3]);
+                                               ins >>= 2;
                         n--;
                     }
                     printf("\t-\n");
@@ -266,6 +193,7 @@ void maq_print_mutref(const char *name, const seq_t *seq, mutseq_t *hap1, mutseq
                     int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4;
                     while (n > 0) {
                         putchar("ACGTN"[ins & 0x3]);
+                                               ins >>= 2;
                         n--;
                     }
                     printf("\t+\n");
@@ -284,46 +212,51 @@ void maq_print_mutref(const char *name, const seq_t *seq, mutseq_t *hap1, mutseq
        }
 }
 
-void wgsim_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, int is_hap, uint64_t N, int dist, int std_dev, int size_l, int size_r)
+void wgsim_core(FILE *fpout1, FILE *fpout2, const char *fn, int is_hap, uint64_t N, int dist, int std_dev, int size_l, int size_r)
 {
-       seq_t seq;
+       kseq_t *ks;
     mutseq_t rseq[2];
+       gzFile fp_fa;
        uint64_t tot_len, ii;
        int i, l, n_ref;
-       char name[256], *qstr;
-       int size[2], Q;
+       char *qstr;
+       int size[2], Q, max_size;
        uint8_t *tmp_seq[2];
     mut_t *target;
 
-       INIT_SEQ(seq);
-       srand48(time(0));
-       seq_set_block_size(0x1000000);
        l = size_l > size_r? size_l : size_r;
        qstr = (char*)calloc(l+1, 1);
        tmp_seq[0] = (uint8_t*)calloc(l+2, 1);
        tmp_seq[1] = (uint8_t*)calloc(l+2, 1);
        size[0] = size_l; size[1] = size_r;
+       max_size = size_l > size_r? size_l : size_r;
 
        Q = (ERR_RATE == 0.0)? 'I' : (int)(-10.0 * log(ERR_RATE) / log(10.0) + 0.499) + 33;
 
+       fp_fa = gzopen(fn, "r");
+       ks = kseq_init(fp_fa);
        tot_len = n_ref = 0;
-       while ((l = seq_read_fasta(fp_fa, &seq, name, 0)) >= 0) {
+       fprintf(stderr, "[%s] calculating the total length of the reference sequence...\n", __func__);
+       while ((l = kseq_read(ks)) >= 0) {
                tot_len += l;
                ++n_ref;
        }
-       fprintf(stderr, "[wgsim_core] %d sequences, total length: %llu\n", n_ref, (long long)tot_len);
-       rewind(fp_fa);
+       fprintf(stderr, "[%s] %d sequences, total length: %llu\n", __func__, n_ref, (long long)tot_len);
+       kseq_destroy(ks);
+       gzclose(fp_fa);
 
-       while ((l = seq_read_fasta(fp_fa, &seq, name, 0)) >= 0) {
+       fp_fa = gzopen(fn, "r");
+       ks = kseq_init(fp_fa);
+       while ((l = kseq_read(ks)) >= 0) {
                uint64_t n_pairs = (uint64_t)((long double)l / tot_len * N + 0.5);
                if (l < dist + 3 * std_dev) {
-                       fprintf(stderr, "[wgsim_core] kkip sequence '%s' as it is shorter than %d!\n", name, dist + 3 * std_dev);
+                       fprintf(stderr, "[%s] skip sequence '%s' as it is shorter than %d!\n", __func__, ks->name.s, dist + 3 * std_dev);
                        continue;
                }
 
                // generate mutations and print them out
-               maq_mut_diref(&seq, is_hap, rseq, rseq+1);
-               maq_print_mutref(name, &seq, rseq, rseq+1);
+               wgsim_mut_diref(ks, is_hap, rseq, rseq+1);
+               wgsim_print_mutref(ks->name.s, ks, rseq, rseq+1);
 
                for (ii = 0; ii != n_pairs; ++ii) { // the core loop
                        double ran;
@@ -335,8 +268,9 @@ void wgsim_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, int is_hap, uint64_t N,
                                ran = ran_normal();
                                ran = ran * std_dev + dist;
                                d = (int)(ran + 0.5);
+                               d = d > max_size? d : max_size;
                                pos = (int)((l - d + 1) * drand48());
-                       } while (pos < 0 || pos >= seq.l || pos + d - 1 >= seq.l);
+                       } while (pos < 0 || pos >= ks->seq.l || pos + d - 1 >= ks->seq.l);
 
                        // flip or not
                        if (drand48() < 0.5) {
@@ -353,7 +287,7 @@ void wgsim_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, int is_hap, uint64_t N,
                        n_sub[0] = n_sub[1] = n_indel[0] = n_indel[1] = n_err[0] = n_err[1] = 0;
 
 #define __gen_read(x, start, iter) do {                                                                        \
-                               for (i = (start), k = 0, ext_coor[x] = -10; i >= 0 && i < seq.l && k < s[x]; iter) {    \
+                               for (i = (start), k = 0, ext_coor[x] = -10; i >= 0 && i < ks->seq.l && k < s[x]; iter) {        \
                                        int c = target[i], mut_type = c & mutmsk;                       \
                                        if (ext_coor[x] < 0) {                                                          \
                                                if (mut_type != NOCHANGE && mut_type != SUBSTITUTE) continue; \
@@ -374,33 +308,9 @@ void wgsim_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, int is_hap, uint64_t N,
                                if (k != s[x]) ext_coor[x] = -10;                                               \
                        } while (0)
 
-                       if (!IS_SOLID) {
-                               __gen_read(0, pos, ++i);
-                               __gen_read(1, pos + d - 1, --i);
-                               for (k = 0; k < s[1]; ++k) tmp_seq[1][k] = tmp_seq[1][k] < 4? 3 - tmp_seq[1][k] : 4; // complement
-                       } else {
-                               int c1, c2, c;
-                               ++s[0]; ++s[1]; // temporarily increase read length by 1
-                               if (is_flip) { // RR pair
-                                       __gen_read(0, pos + s[0], --i);
-                                       __gen_read(1, pos + d - 1, --i);
-                               } else { // FF pair
-                                       __gen_read(0, pos, ++i);
-                                       __gen_read(1, pos + d - 1 - s[1], ++i);
-                                       ++ext_coor[0]; ++ext_coor[1];
-                               }
-                               // change to color sequence: (0,1,2,3) -> (A,C,G,T)
-                               for (j = 0; j < 2; ++j) {
-                                       c1 = tmp_seq[j][0];
-                                       for (i = 1; i < s[j]; ++i) {
-                                               c2 = tmp_seq[j][i];
-                                               c = (c1 >= 4 || c2 >= 4)? 4 : nst_color_space_table[(1<<c1)|(1<<c2)];
-                                               tmp_seq[j][i-1] = c;
-                                               c1 = c2;
-                                       }
-                               }
-                               --s[0]; --s[1]; // change back
-                       }
+                       __gen_read(0, pos, ++i);
+                       __gen_read(1, pos + d - 1, --i);
+                       for (k = 0; k < s[1]; ++k) tmp_seq[1][k] = tmp_seq[1][k] < 4? 3 - tmp_seq[1][k] : 4; // complement
                        if (ext_coor[0] < 0 || ext_coor[1] < 0) { // fail to generate the read(s)
                                --ii;
                                continue;
@@ -408,30 +318,33 @@ void wgsim_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, int is_hap, uint64_t N,
 
                        // generate sequencing errors
                        for (j = 0; j < 2; ++j) {
+                               int n_n = 0;
                                for (i = 0; i < s[j]; ++i) {
                                        int c = tmp_seq[j][i];
-                                       if (c >= 4) c = 4; // actually c should be never larger than 4 if everything is correct
-                                       else if (drand48() < ERR_RATE) {
-                                               c = (c + (int)(drand48() * 3.0 + 1)) & 3;
+                                       if (c >= 4) { // actually c should be never larger than 4 if everything is correct
+                                               c = 4;
+                                               ++n_n;
+                                       } else if (drand48() < ERR_RATE) {
+                                               // c = (c + (int)(drand48() * 3.0 + 1)) & 3; // random sequencing errors
+                                               c = (c + 1) & 3; // recurrent sequencing errors
                                                ++n_err[j];
                                        }
                                        tmp_seq[j][i] = c;
                                }
+                               if ((double)n_n / s[j] > MAX_N_RATIO) break;
+                       }
+                       if (j < 2) { // too many ambiguous bases on one of the reads
+                               --ii;
+                               continue;
                        }
 
                        // print
                        for (j = 0; j < 2; ++j) {
                                for (i = 0; i < s[j]; ++i) qstr[i] = Q;
                                qstr[i] = 0;
-                               if (SHOW_MM_INFO) {
-                                       fprintf(fpo[j], "@%s_%u_%u_%d:%d:%d_%d:%d:%d_%llx/%d\n", name, ext_coor[0]+1, ext_coor[1]+1,
-                                                       n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1],
-                                                       (long long)ii, j==0? is_flip+1 : 2-is_flip);
-                               } else {
-                                       fprintf(fpo[j], "@%s_%u_%u_%llx/%d %d:%d:%d_%d:%d:%d\n", name, ext_coor[0]+1, ext_coor[1]+1,
-                                                       (long long)ii, j==0? is_flip+1 : 2-is_flip,
-                                                       n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1]);
-                               }
+                               fprintf(fpo[j], "@%s_%u_%u_%d:%d:%d_%d:%d:%d_%llx/%d\n", ks->name.s, ext_coor[0]+1, ext_coor[1]+1,
+                                               n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1],
+                                               (long long)ii, j==0? is_flip+1 : 2-is_flip);
                                for (i = 0; i < s[j]; ++i)
                                        fputc("ACGTN"[(int)tmp_seq[j][i]], fpo[j]);
                                fprintf(fpo[j], "\n+\n%s\n", qstr);
@@ -439,7 +352,9 @@ void wgsim_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, int is_hap, uint64_t N,
                }
                free(rseq[0].s); free(rseq[1].s);
        }
-       free(seq.s); free(qstr);
+       kseq_destroy(ks);
+       gzclose(fp_fa);
+       free(qstr);
        free(tmp_seq[0]); free(tmp_seq[1]);
 }
 
@@ -459,11 +374,9 @@ static int simu_usage()
        fprintf(stderr, "         -r FLOAT      rate of mutations [%.4f]\n", MUT_RATE);
        fprintf(stderr, "         -R FLOAT      fraction of indels [%.2f]\n", INDEL_FRAC);
        fprintf(stderr, "         -X FLOAT      probability an indel is extended [%.2f]\n", INDEL_EXTEND);
-       fprintf(stderr, "         -c            generate reads in color space (SOLiD reads)\n");
-       fprintf(stderr, "         -C            show mismatch info in comment rather than read name\n");
+       fprintf(stderr, "         -S INT        seed for random generator [-1]\n");
        fprintf(stderr, "         -h            haplotype mode\n");
        fprintf(stderr, "\n");
-       fprintf(stderr, "Note: For SOLiD reads, the first read is F3 and the second is R3.\n\n");
        return 1;
 }
 
@@ -471,11 +384,12 @@ int main(int argc, char *argv[])
 {
        int64_t N;
        int dist, std_dev, c, size_l, size_r, is_hap = 0;
-       FILE *fpout1, *fpout2, *fp_fa;
+       FILE *fpout1, *fpout2;
+       int seed = -1;
 
        N = 1000000; dist = 500; std_dev = 50;
        size_l = size_r = 70;
-       while ((c = getopt(argc, argv, "e:d:s:N:1:2:r:R:hX:cC")) >= 0) {
+       while ((c = getopt(argc, argv, "e:d:s:N:1:2:r:R:hX:S:")) >= 0) {
                switch (c) {
                case 'd': dist = atoi(optarg); break;
                case 's': std_dev = atoi(optarg); break;
@@ -486,17 +400,20 @@ int main(int argc, char *argv[])
                case 'r': MUT_RATE = atof(optarg); break;
                case 'R': INDEL_FRAC = atof(optarg); break;
                case 'X': INDEL_EXTEND = atof(optarg); break;
-               case 'c': IS_SOLID = 1; break;
-               case 'C': SHOW_MM_INFO = 0; break;
+               case 'S': seed = atoi(optarg); break;
                case 'h': is_hap = 1; break;
                }
        }
        if (argc - optind < 3) return simu_usage();
-       fp_fa = (strcmp(argv[optind+0], "-") == 0)? stdin : xopen(argv[optind+0], "r");
-       fpout1 = xopen(argv[optind+1], "w");
-       fpout2 = xopen(argv[optind+2], "w");
-       wgsim_core(fpout1, fpout2, fp_fa, is_hap, N, dist, std_dev, size_l, size_r);
+       fpout1 = fopen(argv[optind+1], "w");
+       fpout2 = fopen(argv[optind+2], "w");
+       if (!fpout1 || !fpout2) {
+               fprintf(stderr, "[wgsim] file open error\n");
+               return 1;
+       }
+       srand48(seed > 0? seed : time(0));
+       wgsim_core(fpout1, fpout2, argv[optind], is_hap, N, dist, std_dev, size_l, size_r);
 
-       fclose(fpout1); fclose(fpout2); fclose(fp_fa);
+       fclose(fpout1); fclose(fpout2);
        return 0;
 }
index 0821a613bb8b415410f01813f77f1b2465a1198c..efda4e8b93cef37a1a354724695b039bfbf0017f 100644 (file)
@@ -19,8 +19,10 @@ typedef struct {
 
 typedef khash_t(rg) *rghash_t;
 
+// FIXME: we'd better use no global variables...
 static rghash_t g_rghash = 0;
 static int g_min_mapQ = 0, g_flag_on = 0, g_flag_off = 0;
+static float g_subsam = -1;
 static char *g_library, *g_rg;
 static void *g_bed;
 
@@ -34,6 +36,11 @@ static inline int __g_skip_aln(const bam_header_t *h, const bam1_t *b)
                return 1;
        if (g_bed && b->core.tid >= 0 && !bed_overlap(g_bed, h->target_name[b->core.tid], b->core.pos, bam_calend(&b->core, bam1_cigar(b))))
                return 1;
+       if (g_subsam > 0.) {
+               int x = (int)(g_subsam + .499);
+               uint32_t k = __ac_X31_hash_string(bam1_qname(b)) + x;
+               if (k%1024 / 1024.0 >= g_subsam - x) return 1;
+       }
        if (g_rg || g_rghash) {
                uint8_t *s = bam_aux_get(b, "RG");
                if (s) {
@@ -111,8 +118,9 @@ int main_samview(int argc, char *argv[])
 
        /* parse command-line options */
        strcpy(in_mode, "r"); strcpy(out_mode, "w");
-       while ((c = getopt(argc, argv, "Sbct:h1Ho:q:f:F:ul:r:xX?T:R:L:")) >= 0) {
+       while ((c = getopt(argc, argv, "Sbct:h1Ho:q:f:F:ul:r:xX?T:R:L:s:")) >= 0) {
                switch (c) {
+               case 's': g_subsam = atof(optarg); break;
                case 'c': is_count = 1; break;
                case 'S': is_bamin = 0; break;
                case 'b': is_bamout = 1; break;
@@ -279,6 +287,7 @@ static int usage(int is_long_help)
        fprintf(stderr, "         -q INT   minimum mapping quality [0]\n");
        fprintf(stderr, "         -l STR   only output reads in library STR [null]\n");
        fprintf(stderr, "         -r STR   only output reads in read group STR [null]\n");
+       fprintf(stderr, "         -s FLOAT fraction of templates to subsample; integer part as seed [-1]\n");
        fprintf(stderr, "         -?       longer help\n");
        fprintf(stderr, "\n");
        if (is_long_help)
index 4e4b8a455ed445bd9676ed7e1ffe4abbe8f52048..830b9d1cb9f1aad1230c82dd8d24ad958645c7ff 100644 (file)
--- a/sample.c
+++ b/sample.c
@@ -52,7 +52,7 @@ static void add_pair(bam_sample_t *sm, khash_t(sm) *sm2id, const char *key, cons
 int bam_smpl_add(bam_sample_t *sm, const char *fn, const char *txt)
 {
        const char *p = txt, *q, *r;
-       kstring_t buf;
+       kstring_t buf, first_sm;
        int n = 0;
        khash_t(sm) *sm2id = (khash_t(sm)*)sm->sm2id;
        if (txt == 0) {
@@ -60,6 +60,7 @@ int bam_smpl_add(bam_sample_t *sm, const char *fn, const char *txt)
                return 0;
        }
        memset(&buf, 0, sizeof(kstring_t));
+    memset(&first_sm, 0, sizeof(kstring_t));
        while ((q = strstr(p, "@RG")) != 0) {
                p = q + 3;
                r = q = 0;
@@ -73,12 +74,21 @@ int bam_smpl_add(bam_sample_t *sm, const char *fn, const char *txt)
                        oq = *u; or = *v; *u = *v = '\0';
                        buf.l = 0; kputs(fn, &buf); kputc('/', &buf); kputs(q, &buf);
                        add_pair(sm, sm2id, buf.s, r);
+            if ( !first_sm.s )
+                kputs(r,&first_sm); 
                        *u = oq; *v = or;
                } else break;
                p = q > r? q : r;
                ++n;
        }
        if (n == 0) add_pair(sm, sm2id, fn, fn);
+    // If there is only one RG tag present in the header and reads are not annotated, don't refuse to work but
+    //  use the tag instead.
+    else if ( n==1 && first_sm.s )
+        add_pair(sm,sm2id,fn,first_sm.s);
+    if ( first_sm.s )
+        free(first_sm.s);
+
 //     add_pair(sm, sm2id, fn, fn);
        free(buf.s);
        return 0;