From 0f906dafb2ad22371a753557562ef95c3034480d Mon Sep 17 00:00:00 2001 From: Charles Plessy Date: Tue, 6 Sep 2011 15:48:39 +0900 Subject: [PATCH 1/1] Imported Upstream version 0.1.18 --- Makefile.mingw | 23 +- NEWS | 27 ++ bam.c | 4 +- bam.h | 21 +- bam2bcf.c | 91 +++++ bam2bcf.h | 2 + bam2bcf_indel.c | 4 +- bam_aux.c | 17 + bam_import.c | 15 +- bam_index.c | 2 +- bam_md.c | 113 ++++--- bam_pileup.c | 18 +- bcftools/bcf.h | 4 +- bcftools/bcfutils.c | 31 +- bcftools/call1.c | 12 +- bcftools/em.c | 9 +- bcftools/mut.c | 5 +- bedidx.c | 4 + kprobaln.c | 2 +- kseq.h | 89 +++-- misc/Makefile | 15 +- misc/seqtk.c | 783 ++++++++++++++++++++++++++++++++++++++++++++ misc/wgsim.c | 239 +++++--------- sam_view.c | 11 +- sample.c | 12 +- 25 files changed, 1244 insertions(+), 309 deletions(-) create mode 100644 misc/seqtk.c diff --git a/Makefile.mingw b/Makefile.mingw index 836360f..7a57ffc 100644 --- a/Makefile.mingw +++ b/Makefile.mingw @@ -4,14 +4,15 @@ CFLAGS= -g -Wall -O2 DFLAGS= -D_USE_KNETFILE -D_CURSES_LIB=2 KNETFILE_O= knetfile.o LOBJS= bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \ - bam_pileup.o bam_lpileup.o bam_md.o glf.o razf.o faidx.o \ - $(KNETFILE_O) bam_sort.o sam_header.o bam_reheader.o kprobaln.o -AOBJS= bam_tview.o bam_maqcns.o bam_plcmd.o sam_view.o \ + bam_pileup.o bam_lpileup.o bam_md.o razf.o faidx.o \ + $(KNETFILE_O) bam_sort.o sam_header.o bam_reheader.o kprobaln.o bedidx.o +AOBJS= bam_tview.o bam_plcmd.o sam_view.o \ bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \ - bamtk.o kaln.o bam2bcf.o bam2bcf_indel.o errmod.o sample.o + bamtk.o kaln.o bam2bcf.o bam2bcf_indel.o errmod.o sample.o \ + cut_target.o phase.o bam_cat.o bam2depth.o BCFOBJS= bcftools/bcf.o bcftools/fet.o bcftools/bcf2qcall.o bcftools/bcfutils.o \ - bcftools/call1.o bcftools/index.o bcftools/kfunc.o bcftools/ld.o \ - bcftools/prob1.o bcftools/vcf.o + bcftools/call1.o bcftools/index.o bcftools/kfunc.o bcftools/em.o \ + bcftools/kmin.o bcftools/prob1.o bcftools/vcf.o bcftools/mut.o PROG= samtools.exe bcftools.exe INCLUDES= -I. -Iwin32 SUBDIRS= . @@ -35,22 +36,20 @@ libbam.a:$(LOBJS) samtools.exe:$(AOBJS) libbam.a $(BCFOBJS) $(CC) $(CFLAGS) -o $@ $(AOBJS) $(BCFOBJS) $(LIBPATH) -lm -L. -lbam -Lwin32 -lz -lcurses -lws2_32 -bcftools.exe:$(BCFOBJS) bcftools/main.o kstring.o bgzf.o knetfile.o - $(CC) $(CFLAGS) -o $@ $(BCFOBJS) bcftools/main.o kstring.o bgzf.o knetfile.o -lm -Lwin32 -lz -lws2_32 +bcftools.exe:$(BCFOBJS) bcftools/main.o kstring.o bgzf.o knetfile.o bedidx.o + $(CC) $(CFLAGS) -o $@ $(BCFOBJS) bcftools/main.o kstring.o bgzf.o knetfile.o bedidx.o -lm -Lwin32 -lz -lws2_32 razip.o:razf.h bam.o:bam.h razf.h bam_endian.h kstring.h sam_header.h sam.o:sam.h bam.h bam_import.o:bam.h kseq.h khash.h razf.h bam_pileup.o:bam.h razf.h ksort.h -bam_plcmd.o:bam.h faidx.h bam_maqcns.h glf.h bcftools/bcf.h bam2bcf.h +bam_plcmd.o:bam.h faidx.h bcftools/bcf.h bam2bcf.h bam_index.o:bam.h khash.h ksort.h razf.h bam_endian.h bam_lpileup.o:bam.h ksort.h -bam_tview.o:bam.h faidx.h bam_maqcns.h -bam_maqcns.o:bam.h ksort.h bam_maqcns.h kaln.h +bam_tview.o:bam.h faidx.h bam_sort.o:bam.h ksort.h razf.h bam_md.o:bam.h faidx.h -glf.o:glf.h sam_header.o:sam_header.h khash.h bcf.o:bcftools/bcf.h bam2bcf.o:bam2bcf.h errmod.h bcftools/bcf.h diff --git a/NEWS b/NEWS index 1aa2359..41a6cc8 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,30 @@ +Beta Release 0.1.18 (2 September, 2011) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes in samtools: + + * Support the new =/X CIGAR operators (by Peter Cock). + + * Allow to subsample BAM while keeping the pairing intact (view -s). + + * Implemented variant distance bias as a new filter (by Petr Danecek). + + * Bugfix: huge memory usage during indexing + + * Bugfix: use of uninitialized variable in mpileup (rare) + + * Bugfix: wrong BAQ probability (rare) + +Notable changes in bcftools: + + * Support indel in the contrast caller. + + * Bugfix: LRT2=nan in rare cases + +(0.1.18: 2 September 2011, r982:295) + + + Beta Release 0.1.17 (6 July, 2011) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/bam.c b/bam.c index 5fac17d..0055e84 100644 --- a/bam.c +++ b/bam.c @@ -32,7 +32,7 @@ int32_t bam_cigar2qlen(const bam1_core_t *c, const uint32_t *cigar) int32_t l = 0; for (k = 0; k < c->n_cigar; ++k) { int op = cigar[k] & BAM_CIGAR_MASK; - if (op == BAM_CMATCH || op == BAM_CINS || op == BAM_CSOFT_CLIP) + if (op == BAM_CMATCH || op == BAM_CINS || op == BAM_CSOFT_CLIP || op == BAM_CEQUAL || op == BAM_CDIFF) l += cigar[k] >> BAM_CIGAR_SHIFT; } return l; @@ -268,7 +268,7 @@ char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of) else { for (i = 0; i < c->n_cigar; ++i) { kputw(bam1_cigar(b)[i]>>BAM_CIGAR_SHIFT, &str); - kputc("MIDNSHP"[bam1_cigar(b)[i]&BAM_CIGAR_MASK], &str); + kputc("MIDNSHP=X"[bam1_cigar(b)[i]&BAM_CIGAR_MASK], &str); } } kputc('\t', &str); diff --git a/bam.h b/bam.h index 246b020..346c750 100644 --- a/bam.h +++ b/bam.h @@ -40,7 +40,7 @@ @copyright Genome Research Ltd. */ -#define BAM_VERSION "0.1.17 (r973:277)" +#define BAM_VERSION "0.1.18 (r982:295)" #include #include @@ -134,20 +134,25 @@ typedef struct { /* CIGAR operations. */ -/*! @abstract CIGAR: match */ +/*! @abstract CIGAR: M = match or mismatch*/ #define BAM_CMATCH 0 -/*! @abstract CIGAR: insertion to the reference */ +/*! @abstract CIGAR: I = insertion to the reference */ #define BAM_CINS 1 -/*! @abstract CIGAR: deletion from the reference */ +/*! @abstract CIGAR: D = deletion from the reference */ #define BAM_CDEL 2 -/*! @abstract CIGAR: skip on the reference (e.g. spliced alignment) */ +/*! @abstract CIGAR: N = skip on the reference (e.g. spliced alignment) */ #define BAM_CREF_SKIP 3 -/*! @abstract CIGAR: clip on the read with clipped sequence present in qseq */ +/*! @abstract CIGAR: S = clip on the read with clipped sequence + present in qseq */ #define BAM_CSOFT_CLIP 4 -/*! @abstract CIGAR: clip on the read with clipped sequence trimmed off */ +/*! @abstract CIGAR: H = clip on the read with clipped sequence trimmed off */ #define BAM_CHARD_CLIP 5 -/*! @abstract CIGAR: padding */ +/*! @abstract CIGAR: P = padding */ #define BAM_CPAD 6 +/*! @abstract CIGAR: equals = match */ +#define BAM_CEQUAL 7 +/*! @abstract CIGAR: X = mismatch */ +#define BAM_CDIFF 8 /*! @typedef @abstract Structure for core alignment information. diff --git a/bam2bcf.c b/bam2bcf.c index f263fad..dec3305 100644 --- a/bam2bcf.c +++ b/bam2bcf.c @@ -39,6 +39,7 @@ void bcf_call_destroy(bcf_callaux_t *bca) * negative if we are looking at an indel. */ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t *bca, bcf_callret1_t *r) { + static int *var_pos = NULL, nvar_pos = 0; int i, n, ref4, is_indel, ori_depth = 0; memset(r, 0, sizeof(bcf_callret1_t)); if (ref_base >= 0) { @@ -94,9 +95,92 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t r->depth = n; r->ori_depth = ori_depth; // glfgen errmod_cal(bca->e, n, 5, bca->bases, r->p); + + // Calculate the Variant Distance Bias (make it optional?) + if ( nvar_pos < _n ) { + nvar_pos = _n; + var_pos = realloc(var_pos,sizeof(int)*nvar_pos); + } + int alt_dp=0, read_len=0; + for (i=0; i<_n; i++) { + const bam_pileup1_t *p = pl + i; + if ( bam1_seqi(bam1_seq(p->b),p->qpos) == ref_base ) + continue; + + var_pos[alt_dp] = p->qpos; + if ( (bam1_cigar(p->b)[0]&BAM_CIGAR_MASK)==4 ) + var_pos[alt_dp] -= bam1_cigar(p->b)[0]>>BAM_CIGAR_SHIFT; + + alt_dp++; + read_len += p->b->core.l_qseq; + } + float mvd=0; + int j; + n=0; + for (i=0; imvd[0] = n ? mvd/n : 0; + r->mvd[1] = alt_dp; + r->mvd[2] = alt_dp ? read_len/alt_dp : 0; + return r->depth; } + +void calc_vdb(int n, const bcf_callret1_t *calls, bcf_call_t *call) +{ + // Variant distance bias. Samples merged by means of DP-weighted average. + + float weight=0, tot_prob=0; + + int i; + for (i=0; i2*mu ? 0 : sin(mvd*3.14/2/mu) / (4*mu/3.14); + } + else + { + // Scaled gaussian curve, crude approximation, but behaves well. Using fixed depth for bigger depths. + if ( dp>5 ) + dp = 5; + float sigma2 = (read_len/1.9/(dp+1)) * (read_len/1.9/(dp+1)); + float norm = 1.125*sqrt(2*3.14*sigma2); + float mu = read_len/2.9; + if ( mvd < mu ) + prob = exp(-(mvd-mu)*(mvd-mu)/2/sigma2)/norm; + else + prob = exp(-(mvd-mu)*(mvd-mu)/3.125/sigma2)/norm; + } + + //fprintf(stderr,"dp=%d mvd=%d read_len=%d -> prob=%f\n", dp,mvd,read_len,prob); + tot_prob += prob*dp; + weight += dp; + } + tot_prob = weight ? tot_prob/weight : 1; + //fprintf(stderr,"prob=%f\n", tot_prob); + call->vdb = tot_prob; +} + int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, bcf_call_t *call) { int ref4, i, j, qsum[4]; @@ -170,6 +254,9 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, call->ori_depth += calls[i].ori_depth; for (j = 0; j < 16; ++j) call->anno[j] += calls[i].anno[j]; } + + calc_vdb(n, calls, call); + return 0; } @@ -223,6 +310,10 @@ int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bc if (i) kputc(',', &s); kputw(bc->anno[i], &s); } + if ( bc->vdb!=1 ) + { + ksprintf(&s, ";VDB=%.4f", bc->vdb); + } kputc('\0', &s); // FMT kputs("PL", &s); diff --git a/bam2bcf.h b/bam2bcf.h index 9585672..4af080c 100644 --- a/bam2bcf.h +++ b/bam2bcf.h @@ -26,6 +26,7 @@ typedef struct { int depth, ori_depth, qsum[4]; int anno[16]; float p[25]; + int mvd[3]; // mean variant distance, number of variant reads, average read length } bcf_callret1_t; typedef struct { @@ -33,6 +34,7 @@ typedef struct { int n, n_alleles, shift, ori_ref, unseen; int anno[16], depth, ori_depth; uint8_t *PL; + float vdb; // variant distance bias } bcf_call_t; #ifdef __cplusplus diff --git a/bam2bcf_indel.c b/bam2bcf_indel.c index 2fdee9f..5142b3e 100644 --- a/bam2bcf_indel.c +++ b/bam2bcf_indel.c @@ -66,7 +66,7 @@ static int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos, for (k = 0; k < c->n_cigar; ++k) { int op = cigar[k] & BAM_CIGAR_MASK; int l = cigar[k] >> BAM_CIGAR_SHIFT; - if (op == BAM_CMATCH) { + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { if (c->pos > tpos) return y; if (x + l > tpos) { *_tpos = tpos; @@ -222,7 +222,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla for (k = 0; k < b->core.n_cigar; ++k) { int op = cigar[k]&0xf; int j, l = cigar[k]>>4; - if (op == BAM_CMATCH) { + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { for (j = 0; j < l; ++j) if (x + j >= left && x + j < right) cns[x+j-left] += (bam1_seqi(seq, y+j) == ref0[x+j-left])? 1 : 0x10000; diff --git a/bam_aux.c b/bam_aux.c index 2247bdf..28b22e3 100644 --- a/bam_aux.c +++ b/bam_aux.c @@ -59,6 +59,23 @@ int bam_aux_del(bam1_t *b, uint8_t *s) return 0; } +int bam_aux_drop_other(bam1_t *b, uint8_t *s) +{ + if (s) { + uint8_t *p, *aux; + aux = bam1_aux(b); + p = s - 2; + __skip_tag(s); + memmove(aux, p, s - p); + b->data_len -= b->l_aux - (s - p); + b->l_aux = s - p; + } else { + b->data_len -= b->l_aux; + b->l_aux = 0; + } + return 0; +} + void bam_init_header_hash(bam_header_t *header) { if (header->hash == 0) { diff --git a/bam_import.c b/bam_import.c index 29516c9..5518a9c 100644 --- a/bam_import.c +++ b/bam_import.c @@ -14,7 +14,7 @@ #include "kseq.h" #include "khash.h" -KSTREAM_INIT(gzFile, gzread, 8192) +KSTREAM_INIT(gzFile, gzread, 16384) KHASH_MAP_INIT_STR(ref, uint64_t) void bam_init_header_hash(bam_header_t *header); @@ -292,20 +292,22 @@ int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b) z += str->l + 1; if (str->s[0] != '*') { for (s = str->s; *s; ++s) { - if (isalpha(*s)) ++c->n_cigar; + if ((isalpha(*s)) || (*s=='=')) ++c->n_cigar; else if (!isdigit(*s)) parse_error(fp->n_lines, "invalid CIGAR character"); } b->data = alloc_data(b, doff + c->n_cigar * 4); for (i = 0, s = str->s; i != c->n_cigar; ++i) { x = strtol(s, &t, 10); op = toupper(*t); - if (op == 'M' || op == '=' || op == 'X') op = BAM_CMATCH; + if (op == 'M') op = BAM_CMATCH; else if (op == 'I') op = BAM_CINS; else if (op == 'D') op = BAM_CDEL; else if (op == 'N') op = BAM_CREF_SKIP; else if (op == 'S') op = BAM_CSOFT_CLIP; else if (op == 'H') op = BAM_CHARD_CLIP; else if (op == 'P') op = BAM_CPAD; + else if (op == '=') op = BAM_CEQUAL; + else if (op == 'X') op = BAM_CDIFF; else parse_error(fp->n_lines, "invalid CIGAR operation"); s = t + 1; bam1_cigar(b)[i] = x << BAM_CIGAR_SHIFT | op; @@ -337,8 +339,11 @@ int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b) z += str->l + 1; if (strcmp(str->s, "*")) { c->l_qseq = strlen(str->s); - if (c->n_cigar && c->l_qseq != (int32_t)bam_cigar2qlen(c, bam1_cigar(b))) - parse_error(fp->n_lines, "CIGAR and sequence length are inconsistent"); + if (c->n_cigar && c->l_qseq != (int32_t)bam_cigar2qlen(c, bam1_cigar(b))) { + fprintf(stderr, "Line %ld, sequence length %i vs %i from CIGAR\n", + (long)fp->n_lines, c->l_qseq, (int32_t)bam_cigar2qlen(c, bam1_cigar(b))); + parse_error(fp->n_lines, "CIGAR and sequence length are inconsistent"); + } p = (uint8_t*)alloc_data(b, doff + c->l_qseq + (c->l_qseq+1)/2) + doff; memset(p, 0, (c->l_qseq+1)/2); for (i = 0; i < c->l_qseq; ++i) diff --git a/bam_index.c b/bam_index.c index 66d8eb8..9610a26 100644 --- a/bam_index.c +++ b/bam_index.c @@ -188,7 +188,7 @@ bam_index_t *bam_index_core(bamFile fp) bam1_qname(b), last_coor, c->pos, c->tid+1); return NULL; } - if (c->tid >= 0) insert_offset2(&idx->index2[b->core.tid], b, last_off); + if (c->tid >= 0 && !(c->flag & BAM_FUNMAP)) insert_offset2(&idx->index2[b->core.tid], b, last_off); if (c->bin != last_bin) { // then possibly write the binning index if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record insert_offset(idx->index[save_tid], save_bin, save_off, last_off); diff --git a/bam_md.c b/bam_md.c index 20b1913..d42aa8f 100644 --- a/bam_md.c +++ b/bam_md.c @@ -9,40 +9,46 @@ #include "kaln.h" #include "kprobaln.h" +#define USE_EQUAL 1 +#define DROP_TAG 2 +#define BIN_QUAL 4 +#define UPDATE_NM 8 +#define UPDATE_MD 16 +#define HASH_QNM 32 + char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 }; -void bam_fillmd1_core(bam1_t *b, char *ref, int is_equal, int max_nm) +int bam_aux_drop_other(bam1_t *b, uint8_t *s); + +void bam_fillmd1_core(bam1_t *b, char *ref, int flag, int max_nm) { uint8_t *seq = bam1_seq(b); uint32_t *cigar = bam1_cigar(b); bam1_core_t *c = &b->core; int i, x, y, u = 0; kstring_t *str; - uint8_t *old_md, *old_nm; int32_t old_nm_i = -1, nm = 0; str = (kstring_t*)calloc(1, sizeof(kstring_t)); for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) { int j, l = cigar[i]>>4, op = cigar[i]&0xf; - if (op == BAM_CMATCH) { + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { for (j = 0; j < l; ++j) { int z = y + j; int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]]; if (ref[x+j] == 0) break; // out of boundary if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match - if (is_equal) seq[z/2] &= (z&1)? 0xf0 : 0x0f; + if (flag&USE_EQUAL) seq[z/2] &= (z&1)? 0xf0 : 0x0f; ++u; } else { - ksprintf(str, "%d", u); - kputc(ref[x+j], str); + kputw(u, str); kputc(ref[x+j], str); u = 0; ++nm; } } if (j < l) break; x += l; y += l; } else if (op == BAM_CDEL) { - ksprintf(str, "%d", u); - kputc('^', str); + kputw(u, str); kputc('^', str); for (j = 0; j < l; ++j) { if (ref[x+j] == 0) break; kputc(ref[x+j], str); @@ -57,12 +63,12 @@ void bam_fillmd1_core(bam1_t *b, char *ref, int is_equal, int max_nm) x += l; } } - ksprintf(str, "%d", u); + kputw(u, str); // apply max_nm if (max_nm > 0 && nm >= max_nm) { for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) { int j, l = cigar[i]>>4, op = cigar[i]&0xf; - if (op == BAM_CMATCH) { + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { for (j = 0; j < l; ++j) { int z = y + j; int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]]; @@ -79,38 +85,54 @@ void bam_fillmd1_core(bam1_t *b, char *ref, int is_equal, int max_nm) } } // update NM - old_nm = bam_aux_get(b, "NM"); - if (c->flag & BAM_FUNMAP) return; - if (old_nm) old_nm_i = bam_aux2i(old_nm); - if (!old_nm) bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm); - else if (nm != old_nm_i) { - fprintf(stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam1_qname(b), old_nm_i, nm); - bam_aux_del(b, old_nm); - bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm); + if (flag & UPDATE_NM) { + uint8_t *old_nm = bam_aux_get(b, "NM"); + if (c->flag & BAM_FUNMAP) return; + if (old_nm) old_nm_i = bam_aux2i(old_nm); + if (!old_nm) bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm); + else if (nm != old_nm_i) { + fprintf(stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam1_qname(b), old_nm_i, nm); + bam_aux_del(b, old_nm); + bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm); + } } // update MD - old_md = bam_aux_get(b, "MD"); - if (!old_md) bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s); - else { - int is_diff = 0; - if (strlen((char*)old_md+1) == str->l) { - for (i = 0; i < str->l; ++i) - if (toupper(old_md[i+1]) != toupper(str->s[i])) - break; - if (i < str->l) is_diff = 1; - } else is_diff = 1; - if (is_diff) { - fprintf(stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam1_qname(b), old_md+1, str->s); - bam_aux_del(b, old_md); - bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s); + if (flag & UPDATE_MD) { + uint8_t *old_md = bam_aux_get(b, "MD"); + if (c->flag & BAM_FUNMAP) return; + if (!old_md) bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s); + else { + int is_diff = 0; + if (strlen((char*)old_md+1) == str->l) { + for (i = 0; i < str->l; ++i) + if (toupper(old_md[i+1]) != toupper(str->s[i])) + break; + if (i < str->l) is_diff = 1; + } else is_diff = 1; + if (is_diff) { + fprintf(stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam1_qname(b), old_md+1, str->s); + bam_aux_del(b, old_md); + bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s); + } } } + // drop all tags but RG + if (flag&DROP_TAG) { + uint8_t *q = bam_aux_get(b, "RG"); + bam_aux_drop_other(b, q); + } + // reduce the resolution of base quality + if (flag&BIN_QUAL) { + uint8_t *qual = bam1_qual(b); + for (i = 0; i < b->core.l_qseq; ++i) + if (qual[i] >= 3) qual[i] = qual[i]/10*10 + 7; + } free(str->s); free(str); } -void bam_fillmd1(bam1_t *b, char *ref, int is_equal) +void bam_fillmd1(bam1_t *b, char *ref, int flag) { - bam_fillmd1_core(b, ref, is_equal, 0); + bam_fillmd1_core(b, ref, flag, 0); } int bam_cap_mapQ(bam1_t *b, char *ref, int thres) @@ -124,7 +146,7 @@ int bam_cap_mapQ(bam1_t *b, char *ref, int thres) mm = q = len = clip_l = clip_q = 0; for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) { int j, l = cigar[i]>>4, op = cigar[i]&0xf; - if (op == BAM_CMATCH) { + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { for (j = 0; j < l; ++j) { int z = y + j; int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]]; @@ -197,7 +219,7 @@ int bam_prob_realn_core(bam1_t *b, const char *ref, int flag) for (k = 0; k < c->n_cigar; ++k) { int op, l; op = cigar[k]&0xf; l = cigar[k]>>4; - if (op == BAM_CMATCH) { + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { if (yb < 0) yb = y; if (xb < 0) xb = x; ye = y + l; xe = x + l; @@ -233,7 +255,7 @@ int bam_prob_realn_core(bam1_t *b, const char *ref, int flag) if (!extend_baq) { // in this block, bq[] is capped by base quality qual[] for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) { int op = cigar[k]&0xf, l = cigar[k]>>4; - if (op == BAM_CMATCH) { + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { for (i = y; i < y + l; ++i) { if ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y)) bq[i] = 0; else bq[i] = bq[i] < q[i]? bq[i] : q[i]; @@ -248,7 +270,7 @@ int bam_prob_realn_core(bam1_t *b, const char *ref, int flag) left = calloc(c->l_qseq, 1); rght = calloc(c->l_qseq, 1); for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) { int op = cigar[k]&0xf, l = cigar[k]>>4; - if (op == BAM_CMATCH) { + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { for (i = y; i < y + l; ++i) bq[i] = ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y))? 0 : q[i]; for (left[y] = bq[y], i = y + 1; i < y + l; ++i) @@ -280,19 +302,24 @@ int bam_prob_realn(bam1_t *b, const char *ref) int bam_fillmd(int argc, char *argv[]) { - int c, is_equal, tid = -2, ret, len, is_bam_out, is_sam_in, is_uncompressed, max_nm, is_realn, capQ, baq_flag; + int c, flt_flag, tid = -2, ret, len, is_bam_out, is_sam_in, is_uncompressed, max_nm, is_realn, capQ, baq_flag; samfile_t *fp, *fpout = 0; faidx_t *fai; char *ref = 0, mode_w[8], mode_r[8]; bam1_t *b; - is_equal = is_bam_out = is_sam_in = is_uncompressed = is_realn = max_nm = capQ = baq_flag = 0; + flt_flag = UPDATE_NM | UPDATE_MD; + is_bam_out = is_sam_in = is_uncompressed = is_realn = max_nm = capQ = baq_flag = 0; mode_w[0] = mode_r[0] = 0; strcpy(mode_r, "r"); strcpy(mode_w, "w"); - while ((c = getopt(argc, argv, "EreubSC:n:A")) >= 0) { + while ((c = getopt(argc, argv, "EqreuNhbSC:n:Ad")) >= 0) { switch (c) { case 'r': is_realn = 1; break; - case 'e': is_equal = 1; break; + case 'e': flt_flag |= USE_EQUAL; break; + case 'd': flt_flag |= DROP_TAG; break; + case 'q': flt_flag |= BIN_QUAL; break; + case 'h': flt_flag |= HASH_QNM; break; + case 'N': flt_flag &= ~(UPDATE_MD|UPDATE_NM); break; case 'b': is_bam_out = 1; break; case 'u': is_uncompressed = is_bam_out = 1; break; case 'S': is_sam_in = 1; break; @@ -344,7 +371,7 @@ int bam_fillmd(int argc, char *argv[]) int q = bam_cap_mapQ(b, ref, capQ); if (b->core.qual > q) b->core.qual = q; } - if (ref) bam_fillmd1_core(b, ref, is_equal, max_nm); + if (ref) bam_fillmd1_core(b, ref, flt_flag, max_nm); } samwrite(fpout, b); } diff --git a/bam_pileup.c b/bam_pileup.c index 3e26f74..57434e0 100644 --- a/bam_pileup.c +++ b/bam_pileup.c @@ -78,12 +78,12 @@ static inline int resolve_cigar2(bam_pileup1_t *p, uint32_t pos, cstate_t *s) if (s->k == -1) { // never processed is_head = 1; if (c->n_cigar == 1) { // just one operation, save a loop - if (_cop(cigar[0]) == BAM_CMATCH) s->k = 0, s->x = c->pos, s->y = 0; + if (_cop(cigar[0]) == BAM_CMATCH || _cop(cigar[0]) == BAM_CEQUAL || _cop(cigar[0]) == BAM_CDIFF) s->k = 0, s->x = c->pos, s->y = 0; } else { // find the first match or deletion for (k = 0, s->x = c->pos, s->y = 0; k < c->n_cigar; ++k) { int op = _cop(cigar[k]); int l = _cln(cigar[k]); - if (op == BAM_CMATCH || op == BAM_CDEL) break; + if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CEQUAL || op == BAM_CDIFF) break; else if (op == BAM_CREF_SKIP) s->x += l; else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l; } @@ -95,16 +95,16 @@ static inline int resolve_cigar2(bam_pileup1_t *p, uint32_t pos, cstate_t *s) if (pos - s->x >= l) { // jump to the next operation assert(s->k < c->n_cigar); // otherwise a bug: this function should not be called in this case op = _cop(cigar[s->k+1]); - if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP) { // jump to the next without a loop - if (_cop(cigar[s->k]) == BAM_CMATCH) s->y += l; + if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { // jump to the next without a loop + if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l; s->x += l; ++s->k; - } else { // find the next M/D/N - if (_cop(cigar[s->k]) == BAM_CMATCH) s->y += l; + } else { // find the next M/D/N/=/X + if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l; s->x += l; for (k = s->k + 1; k < c->n_cigar; ++k) { op = _cop(cigar[k]), l = _cln(cigar[k]); - if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP) break; + if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) break; else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l; } s->k = k; @@ -126,12 +126,12 @@ static inline int resolve_cigar2(bam_pileup1_t *p, uint32_t pos, cstate_t *s) for (k = s->k + 2; k < c->n_cigar; ++k) { op2 = _cop(cigar[k]); l2 = _cln(cigar[k]); if (op2 == BAM_CINS) l3 += l2; - else if (op2 == BAM_CDEL || op2 == BAM_CMATCH || op2 == BAM_CREF_SKIP) break; + else if (op2 == BAM_CDEL || op2 == BAM_CMATCH || op2 == BAM_CREF_SKIP || op2 == BAM_CEQUAL || op2 == BAM_CDIFF) break; } if (l3 > 0) p->indel = l3; } } - if (op == BAM_CMATCH) { + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { p->qpos = s->y + (pos - s->x); } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) { p->is_del = 1; p->qpos = s->y; // FIXME: distinguish D and N!!!!! diff --git a/bcftools/bcf.h b/bcftools/bcf.h index aeae6ca..822ae5c 100644 --- a/bcftools/bcf.h +++ b/bcftools/bcf.h @@ -28,7 +28,7 @@ #ifndef BCF_H #define BCF_H -#define BCF_VERSION "0.1.17 (r973:277)" +#define BCF_VERSION "0.1.17-dev (r973:277)" #include #include @@ -154,6 +154,8 @@ extern "C" { int bcf_fix_pl(bcf1_t *b); // convert PL to GLF-like 10-likelihood GL int bcf_gl10(const bcf1_t *b, uint8_t *gl); + // convert up to 4 INDEL alleles to GLF-like 10-likelihood GL + int bcf_gl10_indel(const bcf1_t *b, uint8_t *gl); // string hash table void *bcf_build_refhash(bcf_hdr_t *h); diff --git a/bcftools/bcfutils.c b/bcftools/bcfutils.c index e11cfce..0eab4c1 100644 --- a/bcftools/bcfutils.c +++ b/bcftools/bcfutils.c @@ -5,6 +5,11 @@ #include "khash.h" KHASH_MAP_INIT_STR(str2id, int) +#ifdef _WIN32 +#define srand48(x) srand(x) +#define drand48() ((double)rand() / RAND_MAX) +#endif + // FIXME: valgrind report a memory leak in this function. Probably it does not get deallocated... void *bcf_build_refhash(bcf_hdr_t *h) { @@ -349,8 +354,6 @@ int bcf_gl10(const bcf1_t *b, uint8_t *gl) for (i = 0; i < b->n_smpl; ++i) { const uint8_t *p = PL->data + i * PL->len; // the PL for the i-th individual uint8_t *g = gl + 10 * i; - for (j = 0; j < PL->len; ++j) - if (p[j]) break; for (k = j = 0; k < 4; ++k) { for (l = k; l < 4; ++l) { int t, x = map[k], y = map[l]; @@ -361,3 +364,27 @@ int bcf_gl10(const bcf1_t *b, uint8_t *gl) } return 0; } + +int bcf_gl10_indel(const bcf1_t *b, uint8_t *gl) +{ + int k, l, j, i; + const bcf_ginfo_t *PL; + if (b->alt[0] == 0) return -1; // no alternate allele + for (i = 0; i < b->n_gi; ++i) + if (b->gi[i].fmt == bcf_str2int("PL", 2)) break; + if (i == b->n_gi) return -1; // no PL + PL = b->gi + i; + for (i = 0; i < b->n_smpl; ++i) { + const uint8_t *p = PL->data + i * PL->len; // the PL for the i-th individual + uint8_t *g = gl + 10 * i; + for (k = j = 0; k < 4; ++k) { + for (l = k; l < 4; ++l) { + int t, x = k, y = l; + if (x > y) t = x, x = y, y = t; // make sure x is the smaller + x = y * (y+1) / 2 + x; + g[j++] = x < PL->len? p[x] : 255; + } + } + } + return 0; +} diff --git a/bcftools/call1.c b/bcftools/call1.c index b2d7d4a..3cc4649 100644 --- a/bcftools/call1.c +++ b/bcftools/call1.c @@ -8,6 +8,11 @@ #include "kstring.h" #include "time.h" +#ifdef _WIN32 +#define srand48(x) srand(x) +#define lrand48() rand() +#endif + #include "kseq.h" KSTREAM_INIT(gzFile, gzread, 16384) @@ -126,7 +131,6 @@ static int update_bcf1(bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p1rst_t *pr, if (em[5] >= 0 && em[6] >= 0) ksprintf(&s, ";AF2=%.4g,%.4g", 1 - em[5], 1 - em[6]); if (em[7] >= 0) ksprintf(&s, ";LRT=%.3g", em[7]); if (em[8] >= 0) ksprintf(&s, ";LRT2=%.3g", em[8]); - //if (em[9] >= 0) ksprintf(&s, ";LRT1=%.3g", em[9]); } if (cons_llr > 0) { ksprintf(&s, ";CLR=%d", cons_llr); @@ -269,6 +273,8 @@ static void write_header(bcf_hdr_t *h) kputs("##INFO=\n", &str); if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); if (!strstr(str.s, "##FORMAT=\n", &str); if (!strstr(str.s, "##FORMAT=\n", &str); if (!strstr(str.s, "##FORMAT=\n", &str); + kputs("##FORMAT=\n", &str); h->l_txt = str.l + 1; h->txt = str.s; } @@ -490,7 +496,7 @@ int bcfview(int argc, char *argv[]) if (!(l > begin && end > b->pos)) continue; } ++n_processed; - if (vc.flag & VC_QCNT) { // summarize the difference + if ((vc.flag & VC_QCNT) && !is_indel) { // summarize the difference int x = bcf_min_diff(b); if (x > 255) x = 255; if (x >= 0) ++qcnt[x]; diff --git a/bcftools/em.c b/bcftools/em.c index 32b400f..b7dfe1a 100644 --- a/bcftools/em.c +++ b/bcftools/em.c @@ -168,7 +168,6 @@ static double lk_ratio_test(int n, int n1, const double *pdg, double f3[3][3]) // x[5..6]: group1 freq, group2 freq // x[7]: 1-degree P-value // x[8]: 2-degree P-value -// x[9]: 1-degree P-value with freq estimated from genotypes int bcf_em1(const bcf1_t *b, int n1, int flag, double x[10]) { double *pdg; @@ -208,11 +207,13 @@ int bcf_em1(const bcf1_t *b, int n1, int flag, double x[10]) x[6] = freqml(x[0], n1, n, pdg); } if ((flag & 1<<7) && n1 > 0 && n1 < n) { // 1-degree P-value - double f[3], f3[3][3]; + double f[3], f3[3][3], tmp; f[0] = x[0]; f[1] = x[5]; f[2] = x[6]; for (i = 0; i < 3; ++i) f3[i][0] = (1-f[i])*(1-f[i]), f3[i][1] = 2*f[i]*(1-f[i]), f3[i][2] = f[i]*f[i]; - x[7] = kf_gammaq(.5, log(lk_ratio_test(n, n1, pdg, f3))); + tmp = log(lk_ratio_test(n, n1, pdg, f3)); + if (tmp < 0) tmp = 0; + x[7] = kf_gammaq(.5, tmp); } if ((flag & 3<<8) && n1 > 0 && n1 < n) { // 2-degree P-value double g[3][3], tmp; @@ -222,8 +223,8 @@ int bcf_em1(const bcf1_t *b, int n1, int flag, double x[10]) for (i = 0; i < ITER_MAX; ++i) if (g3_iter(g[2], pdg, n1, n) < EPS) break; tmp = log(lk_ratio_test(n, n1, pdg, g)); + if (tmp < 0) tmp = 0; x[8] = kf_gammaq(1., tmp); - x[9] = kf_gammaq(.5, tmp); } // free free(pdg); diff --git a/bcftools/mut.c b/bcftools/mut.c index 4b7cd10..15ef265 100644 --- a/bcftools/mut.c +++ b/bcftools/mut.c @@ -33,6 +33,7 @@ uint32_t *bcf_trio_prep(int is_x, int is_son) return ret; } + int bcf_trio_call(const uint32_t *prep, const bcf1_t *b, int *llr, int64_t *gt) { int i, j, k; @@ -44,7 +45,9 @@ int bcf_trio_call(const uint32_t *prep, const bcf1_t *b, int *llr, int64_t *gt) if (b->gi[i].fmt == bcf_str2int("PL", 2)) break; if (i == b->n_gi) return -1; // no PL gl10 = alloca(10 * b->n_smpl); - if (bcf_gl10(b, gl10) < 0) return -1; + if (bcf_gl10(b, gl10) < 0) { + if (bcf_gl10_indel(b, gl10) < 0) return -1; + } PL = b->gi + i; for (i = 0, k = 0; i < 4; ++i) for (j = i; j < 4; ++j) diff --git a/bedidx.c b/bedidx.c index 34f0f2f..ec75a10 100644 --- a/bedidx.c +++ b/bedidx.c @@ -4,6 +4,10 @@ #include #include +#ifdef _WIN32 +#define drand48() ((double)rand() / RAND_MAX) +#endif + #include "ksort.h" KSORT_INIT_GENERIC(uint64_t) diff --git a/kprobaln.c b/kprobaln.c index 5201c1a..894a2ae 100644 --- a/kprobaln.c +++ b/kprobaln.c @@ -161,7 +161,7 @@ int kpa_glocal(const uint8_t *_ref, int l_ref, const uint8_t *_query, int l_quer double p = 1., Pr1 = 0.; for (i = 0; i <= l_query + 1; ++i) { p *= s[i]; - if (p < 1e-100) Pr += -4.343 * log(p), p = 1.; + if (p < 1e-100) Pr1 += -4.343 * log(p), p = 1.; } Pr1 += -4.343 * log(p * l_ref * l_query); Pr = (int)(Pr1 + .499); diff --git a/kseq.h b/kseq.h index 82face0..0bbc7dc 100644 --- a/kseq.h +++ b/kseq.h @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2008 Genome Research Ltd (GRL). + Copyright (c) 2008, 2009, 2011 Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -23,13 +23,7 @@ SOFTWARE. */ -/* Contact: Heng Li */ - -/* - 2009-07-16 (lh3): in kstream_t, change "char*" to "unsigned char*" - */ - -/* Last Modified: 12APR2009 */ +/* Last Modified: 18AUG2011 */ #ifndef AC_KSEQ_H #define AC_KSEQ_H @@ -94,10 +88,10 @@ typedef struct __kstring_t { #endif #define __KS_GETUNTIL(__read, __bufsize) \ - static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ + static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \ { \ if (dret) *dret = 0; \ - str->l = 0; \ + str->l = append? str->l : 0; \ if (ks->begin >= ks->end && ks->is_eof) return -1; \ for (;;) { \ int i; \ @@ -132,13 +126,15 @@ typedef struct __kstring_t { break; \ } \ } \ - if (str->l == 0) { \ + if (str->s == 0) { \ str->m = 1; \ str->s = (char*)calloc(1, 1); \ } \ str->s[str->l] = '\0'; \ return str->l; \ - } + } \ + static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ + { return ks_getuntil2(ks, delimiter, str, dret, 0); } #define KSTREAM_INIT(type_t, __read, __bufsize) \ __KS_TYPE(type_t) \ @@ -171,44 +167,45 @@ typedef struct __kstring_t { -1 end-of-file -2 truncated quality string */ -#define __KSEQ_READ \ - static int kseq_read(kseq_t *seq) \ - { \ - int c; \ - kstream_t *ks = seq->f; \ +#define __KSEQ_READ \ + static int kseq_read(kseq_t *seq) \ + { \ + int c; \ + kstream_t *ks = seq->f; \ if (seq->last_char == 0) { /* then jump to the next header line */ \ - while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ - if (c == -1) return -1; /* end of file */ \ - seq->last_char = c; \ - } /* the first header char has been read */ \ - seq->comment.l = seq->seq.l = seq->qual.l = 0; \ - if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; \ - if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); \ + while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ + if (c == -1) return -1; /* end of file */ \ + seq->last_char = c; \ + } /* else: the first header char has been read in the previous call */ \ + seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \ + if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \ + if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); /* read FASTA/Q comment */ \ + if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \ + seq->seq.m = 256; \ + seq->seq.s = (char*)malloc(seq->seq.m); \ + } \ while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ - if (isgraph(c)) { /* printable non-space character */ \ - if (seq->seq.l + 1 >= seq->seq.m) { /* double the memory */ \ - seq->seq.m = seq->seq.l + 2; \ - kroundup32(seq->seq.m); /* rounded to next closest 2^k */ \ - seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ - } \ - seq->seq.s[seq->seq.l++] = (char)c; \ - } \ - } \ + seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \ + ks_getuntil2(ks, '\n', &seq->seq, 0, 1); /* read the rest of the line */ \ + } \ if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ - seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ - if (c != '+') return seq->seq.l; /* FASTA */ \ - if (seq->qual.m < seq->seq.m) { /* allocate enough memory */ \ - seq->qual.m = seq->seq.m; \ - seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ - } \ + if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \ + seq->seq.m = seq->seq.l + 2; \ + kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \ + seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ + } \ + seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ + if (c != '+') return seq->seq.l; /* FASTA */ \ + if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \ + seq->qual.m = seq->seq.m; \ + seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ + } \ while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ - if (c == -1) return -2; /* we should not stop here */ \ - while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l) \ - if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c; \ - seq->qual.s[seq->qual.l] = 0; /* null terminated string */ \ + if (c == -1) return -2; /* error: no quality string */ \ + while (ks_getuntil2(ks, '\n', &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \ seq->last_char = 0; /* we have not come to the next header line */ \ - if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */ \ - return seq->seq.l; \ + if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \ + return seq->seq.l; \ } #define __KSEQ_TYPE(type_t) \ @@ -219,7 +216,7 @@ typedef struct __kstring_t { } kseq_t; #define KSEQ_INIT(type_t, __read) \ - KSTREAM_INIT(type_t, __read, 4096) \ + KSTREAM_INIT(type_t, __read, 16384) \ __KSEQ_TYPE(type_t) \ __KSEQ_BASIC(type_t) \ __KSEQ_READ diff --git a/misc/Makefile b/misc/Makefile index 6c25c78..d2f8bd8 100644 --- a/misc/Makefile +++ b/misc/Makefile @@ -4,7 +4,7 @@ CFLAGS= -g -Wall -O2 #-m64 #-arch ppc CXXFLAGS= $(CFLAGS) DFLAGS= -D_FILE_OFFSET_BITS=64 OBJS= -PROG= md5sum-lite md5fa maq2sam-short maq2sam-long wgsim +PROG= md5sum-lite md5fa maq2sam-short maq2sam-long wgsim seqtk INCLUDES= -I.. SUBDIRS= . @@ -27,11 +27,11 @@ lib-recur all-recur clean-recur cleanlocal-recur install-recur: lib: -afs2:afs2.o - $(CC) $(CFLAGS) -o $@ afs2.o -lm -lz -L.. -lbam +seqtk:seqtk.o + $(CC) $(CFLAGS) -o $@ seqtk.o -lm -lz wgsim:wgsim.o - $(CC) $(CFLAGS) -o $@ wgsim.o -lm + $(CC) $(CFLAGS) -o $@ wgsim.o -lm -lz md5fa:md5.o md5fa.o md5.h ../kseq.h $(CC) $(CFLAGS) -o $@ md5.o md5fa.o -lz @@ -51,8 +51,11 @@ maq2sam-long:maq2sam.c md5fa.o:md5.h md5fa.c $(CC) $(CFLAGS) -c -I.. -o $@ md5fa.c -afs2.o:afs2.c ../bam.h - $(CC) $(CFLAGS) -c -I.. -o $@ afs2.c +seqtk.o:seqtk.c ../khash.h ../kseq.h + $(CC) $(CFLAGS) -c -I.. -o $@ seqtk.c + +wgsim.o:wgsim.c ../kseq.h + $(CC) $(CFLAGS) -c -I.. -o $@ wgsim.c cleanlocal: rm -fr gmon.out *.o a.out *.exe *.dSYM $(PROG) *~ *.a diff --git a/misc/seqtk.c b/misc/seqtk.c new file mode 100644 index 0000000..591ddff --- /dev/null +++ b/misc/seqtk.c @@ -0,0 +1,783 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kseq.h" +KSEQ_INIT(gzFile, gzread) + +typedef struct { + int n, m; + uint64_t *a; +} reglist_t; + +#include "khash.h" +KHASH_MAP_INIT_STR(reg, reglist_t) + +typedef kh_reg_t reghash_t; + +reghash_t *stk_reg_read(const char *fn) +{ + reghash_t *h = kh_init(reg); + gzFile fp; + kstream_t *ks; + int dret; + kstring_t *str; + // read the list + str = calloc(1, sizeof(kstring_t)); + fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); + ks = ks_init(fp); + while (ks_getuntil(ks, 0, str, &dret) >= 0) { + int beg = -1, end = -1; + reglist_t *p; + khint_t k = kh_get(reg, h, str->s); + if (k == kh_end(h)) { + int ret; + char *s = strdup(str->s); + k = kh_put(reg, h, s, &ret); + memset(&kh_val(h, k), 0, sizeof(reglist_t)); + } + p = &kh_val(h, k); + if (dret != '\n') { + if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) { + beg = atoi(str->s); + if (dret != '\n') { + if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) { + end = atoi(str->s); + if (end < 0) end = -1; + } + } + } + } + // skip the rest of the line + if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); + if (end < 0 && beg > 0) end = beg, beg = beg - 1; // if there is only one column + if (beg < 0) beg = 0, end = INT_MAX; + if (p->n == p->m) { + p->m = p->m? p->m<<1 : 4; + p->a = realloc(p->a, p->m * 8); + } + p->a[p->n++] = (uint64_t)beg<<32 | end; + } + ks_destroy(ks); + gzclose(fp); + free(str->s); free(str); + return h; +} + +void stk_reg_destroy(reghash_t *h) +{ + khint_t k; + for (k = 0; k < kh_end(h); ++k) { + if (kh_exist(h, k)) { + free(kh_val(h, k).a); + free((char*)kh_key(h, k)); + } + } + kh_destroy(reg, h); +} + +/* constant table */ + +unsigned char seq_nt16_table[256] = { + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15 /*'-'*/,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, + 15,15, 5, 6, 8,15, 7, 9, 0,10,15,15, 15,15,15,15, + 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, + 15,15, 5, 6, 8,15, 7, 9, 0,10,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15 +}; + +char *seq_nt16_rev_table = "XACMGRSVTWYHKDBN"; +unsigned char seq_nt16to4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 }; +int bitcnt_table[] = { 4, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 }; + +/* composition */ +int stk_comp(int argc, char *argv[]) +{ + gzFile fp; + kseq_t *seq; + int l, c, upper_only = 0; + reghash_t *h = 0; + reglist_t dummy; + while ((c = getopt(argc, argv, "ur:")) >= 0) { + switch (c) { + case 'u': upper_only = 1; break; + case 'r': h = stk_reg_read(optarg); break; + } + } + if (argc == optind) { + fprintf(stderr, "Usage: seqtk comp [-u] [-r in.bed] \n\n"); + fprintf(stderr, "Output format: chr, length, #A, #C, #G, #T, #2, #3, #4, #CpG, #tv, #ts, #CpG-ts\n"); + return 1; + } + fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r"); + seq = kseq_init(fp); + dummy.n= dummy.m = 1; dummy.a = calloc(1, 8); + while ((l = kseq_read(seq)) >= 0) { + int i, k; + reglist_t *p = 0; + if (h) { + khint_t k = kh_get(reg, h, seq->name.s); + if (k != kh_end(h)) p = &kh_val(h, k); + } else { + p = &dummy; + dummy.a[0] = l; + } + for (k = 0; p && k < p->n; ++k) { + int beg = p->a[k]>>32, end = p->a[k]&0xffffffff; + int la, lb, lc, na, nb, nc, cnt[11]; + if (beg > 0) la = seq->seq.s[beg-1], lb = seq_nt16_table[la], lc = bitcnt_table[lb]; + else la = 'a', lb = -1, lc = 0; + na = seq->seq.s[beg]; nb = seq_nt16_table[na]; nc = bitcnt_table[nb]; + memset(cnt, 0, 11 * sizeof(int)); + for (i = beg; i < end; ++i) { + int is_CpG = 0, a, b, c; + a = na; b = nb; c = nc; + na = seq->seq.s[i+1]; nb = seq_nt16_table[na]; nc = bitcnt_table[nb]; + if (b == 2 || b == 10) { // C or Y + if (nb == 4 || nb == 5) is_CpG = 1; + } else if (b == 4 || b == 5) { // G or R + if (lb == 2 || lb == 10) is_CpG = 1; + } + if (upper_only == 0 || isupper(a)) { + if (c > 1) ++cnt[c+2]; + if (c == 1) ++cnt[seq_nt16to4_table[b]]; + if (b == 10 || b == 5) ++cnt[9]; + else if (c == 2) { + ++cnt[8]; + } + if (is_CpG) { + ++cnt[7]; + if (b == 10 || b == 5) ++cnt[10]; + } + } + la = a; lb = b; lc = c; + } + if (h) printf("%s\t%d\t%d", seq->name.s, beg, end); + else printf("%s\t%d", seq->name.s, l); + for (i = 0; i < 11; ++i) printf("\t%d", cnt[i]); + putchar('\n'); + } + fflush(stdout); + } + free(dummy.a); + kseq_destroy(seq); + gzclose(fp); + return 0; +} + +int stk_randbase(int argc, char *argv[]) +{ + gzFile fp; + kseq_t *seq; + int l; + if (argc == 1) { + fprintf(stderr, "Usage: seqtk randbase \n"); + return 1; + } + fp = (strcmp(argv[1], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[1], "r"); + seq = kseq_init(fp); + while ((l = kseq_read(seq)) >= 0) { + int i; + printf(">%s", seq->name.s); + for (i = 0; i < l; ++i) { + int c, b, a, j, k, m; + b = seq->seq.s[i]; + c = seq_nt16_table[b]; + a = bitcnt_table[c]; + if (a == 2) { + m = (drand48() < 0.5); + for (j = k = 0; j < 4; ++j) { + if ((1<seq.s[i] = islower(b)? "acgt"[j] : "ACGT"[j]; + } + if (i%60 == 0) putchar('\n'); + putchar(seq->seq.s[i]); + } + putchar('\n'); + } + kseq_destroy(seq); + gzclose(fp); + return 0; +} + +int stk_hety(int argc, char *argv[]) +{ + gzFile fp; + kseq_t *seq; + int l, c, win_size = 50000, n_start = 5, win_step, is_lower_mask = 0; + char *buf; + uint32_t cnt[3]; + if (argc == 1) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: seqtk hety [options] \n\n"); + fprintf(stderr, "Options: -w INT window size [%d]\n", win_size); + fprintf(stderr, " -t INT # start positions in a window [%d]\n", n_start); + fprintf(stderr, " -m treat lowercases as masked\n"); + fprintf(stderr, "\n"); + return 1; + } + while ((c = getopt(argc, argv, "w:t:m")) >= 0) { + switch (c) { + case 'w': win_size = atoi(optarg); break; + case 't': n_start = atoi(optarg); break; + case 'm': is_lower_mask = 1; break; + } + } + fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r"); + seq = kseq_init(fp); + win_step = win_size / n_start; + buf = calloc(win_size, 1); + while ((l = kseq_read(seq)) >= 0) { + int x, i, y, z, next = 0; + cnt[0] = cnt[1] = cnt[2] = 0; + for (i = 0; i <= l; ++i) { + if ((i >= win_size && i % win_step == 0) || i == l) { + if (i == l && l >= win_size) { + for (y = l - win_size; y < next; ++y) --cnt[(int)buf[y % win_size]]; + } + if (cnt[1] + cnt[2] > 0) + printf("%s\t%d\t%d\t%.2lf\t%d\t%d\n", seq->name.s, next, i, + (double)cnt[2] / (cnt[1] + cnt[2]) * win_size, cnt[1] + cnt[2], cnt[2]); + next = i; + } + if (i < l) { + y = i % win_size; + c = seq->seq.s[i]; + if (is_lower_mask && islower(c)) c = 'N'; + c = seq_nt16_table[c]; + x = bitcnt_table[c]; + if (i >= win_size) --cnt[(int)buf[y]]; + buf[y] = z = x > 2? 0 : x == 2? 2 : 1; + ++cnt[z]; + } + } + } + free(buf); + kseq_destroy(seq); + gzclose(fp); + return 0; +} + +/* fq2fa */ +int stk_fq2fa(int argc, char *argv[]) +{ + gzFile fp; + kseq_t *seq; + char *buf; + int l, i, c, qual_thres = 0, linelen = 60; + while ((c = getopt(argc, argv, "q:l:")) >= 0) { + switch (c) { + case 'q': qual_thres = atoi(optarg); break; + case 'l': linelen = atoi(optarg); break; + } + } + if (argc == optind) { + fprintf(stderr, "Usage: seqtk fq2fa [-q qualThres=0] [-l lineLen=60] \n"); + return 1; + } + buf = linelen > 0? malloc(linelen + 1) : 0; + fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); + seq = kseq_init(fp); + while ((l = kseq_read(seq)) >= 0) { + if (seq->qual.l && qual_thres > 0) { + for (i = 0; i < l; ++i) + if (seq->qual.s[i] - 33 < qual_thres) + seq->seq.s[i] = tolower(seq->seq.s[i]); + } + putchar('>'); + if (seq->comment.l) { + fputs(seq->name.s, stdout); + putchar(' '); + puts(seq->comment.s); + } else puts(seq->name.s); + if (buf) { // multi-line + for (i = 0; i < l; i += linelen) { + int x = i + linelen < l? linelen : l - i; + memcpy(buf, seq->seq.s + i, x); + buf[x] = 0; + puts(buf); + } + } else puts(seq->seq.s); + } + free(buf); + kseq_destroy(seq); + gzclose(fp); + return 0; +} + +int stk_maskseq(int argc, char *argv[]) +{ + khash_t(reg) *h = kh_init(reg); + gzFile fp; + kseq_t *seq; + int l, i, j, c, is_complement = 0, is_lower = 0; + khint_t k; + while ((c = getopt(argc, argv, "cl")) >= 0) { + switch (c) { + case 'c': is_complement = 1; break; + case 'l': is_lower = 1; break; + } + } + if (argc - optind < 2) { + fprintf(stderr, "Usage: seqtk maskseq [-cl] \n\n"); + fprintf(stderr, "Options: -c mask the complement regions\n"); + fprintf(stderr, " -l soft mask (to lower cases)\n"); + return 1; + } + h = stk_reg_read(argv[optind+1]); + // maskseq + fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); + seq = kseq_init(fp); + while ((l = kseq_read(seq)) >= 0) { + k = kh_get(reg, h, seq->name.s); + if (k == kh_end(h)) { // not found in the hash table + if (is_complement) { + for (j = 0; j < l; ++j) + seq->seq.s[j] = is_lower? tolower(seq->seq.s[j]) : 'N'; + } + } else { + reglist_t *p = &kh_val(h, k); + if (!is_complement) { + for (i = 0; i < p->n; ++i) { + int beg = p->a[i]>>32, end = p->a[i]; + if (beg >= seq->seq.l) { + fprintf(stderr, "[maskseq] start position >= the sequence length.\n"); + continue; + } + if (end >= seq->seq.l) end = seq->seq.l; + if (is_lower) for (j = beg; j < end; ++j) seq->seq.s[j] = tolower(seq->seq.s[j]); + else for (j = beg; j < end; ++j) seq->seq.s[j] = 'N'; + } + } else { + int8_t *mask = calloc(seq->seq.l, 1); + for (i = 0; i < p->n; ++i) { + int beg = p->a[i]>>32, end = p->a[i]; + if (end >= seq->seq.l) end = seq->seq.l; + for (j = beg; j < end; ++j) mask[j] = 1; + } + for (j = 0; j < l; ++j) + if (mask[j] == 0) seq->seq.s[j] = is_lower? tolower(seq->seq.s[j]) : 'N'; + free(mask); + } + } + printf(">%s", seq->name.s); + for (j = 0; j < seq->seq.l; ++j) { + if (j%60 == 0) putchar('\n'); + putchar(seq->seq.s[j]); + } + putchar('\n'); + } + // free + kseq_destroy(seq); + gzclose(fp); + stk_reg_destroy(h); + return 0; +} + +/* subseq */ + +int stk_subseq(int argc, char *argv[]) +{ + khash_t(reg) *h = kh_init(reg); + gzFile fp; + kseq_t *seq; + int l, i, j, c, is_tab = 0; + khint_t k; + while ((c = getopt(argc, argv, "t")) >= 0) { + switch (c) { + case 't': is_tab = 1; break; + } + } + if (optind + 2 > argc) { + fprintf(stderr, "Usage: seqtk subseq [-t] \n\n"); + fprintf(stderr, "Note: Use 'samtools faidx' if only a few regions are intended.\n"); + return 1; + } + h = stk_reg_read(argv[optind+1]); + // subseq + fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); + seq = kseq_init(fp); + while ((l = kseq_read(seq)) >= 0) { + reglist_t *p; + k = kh_get(reg, h, seq->name.s); + if (k == kh_end(h)) continue; + p = &kh_val(h, k); + for (i = 0; i < p->n; ++i) { + int beg = p->a[i]>>32, end = p->a[i]; + if (beg >= seq->seq.l) { + fprintf(stderr, "[subseq] %s: %d >= %ld\n", seq->name.s, beg, seq->seq.l); + continue; + } + if (end > seq->seq.l) end = seq->seq.l; + if (is_tab == 0) { + printf("%c%s", seq->qual.l == seq->seq.l? '@' : '>', seq->name.s); + if (end == INT_MAX) { + if (beg) printf(":%d", beg+1); + } else printf(":%d-%d", beg+1, end); + } else printf("%s\t%d\t", seq->name.s, beg + 1); + if (end > seq->seq.l) end = seq->seq.l; + for (j = 0; j < end - beg; ++j) { + if (is_tab == 0 && j % 60 == 0) putchar('\n'); + putchar(seq->seq.s[j + beg]); + } + putchar('\n'); + if (seq->qual.l != seq->seq.l || is_tab) continue; + printf("+"); + for (j = 0; j < end - beg; ++j) { + if (j % 60 == 0) putchar('\n'); + putchar(seq->qual.s[j + beg]); + } + putchar('\n'); + } + } + // free + kseq_destroy(seq); + gzclose(fp); + stk_reg_destroy(h); + return 0; +} + +/* mergefa */ +int stk_mergefa(int argc, char *argv[]) +{ + gzFile fp[2]; + kseq_t *seq[2]; + int i, l, c, is_intersect = 0, is_haploid = 0, qual = 0, is_mask = 0; + while ((c = getopt(argc, argv, "himq:")) >= 0) { + switch (c) { + case 'i': is_intersect = 1; break; + case 'h': is_haploid = 1; break; + case 'm': is_mask = 1; break; + case 'q': qual = atoi(optarg); break; + } + } + if (is_mask && is_intersect) { + fprintf(stderr, "[%s] `-i' and `-h' cannot be applied at the same time.\n", __func__); + return 1; + } + if (optind + 2 > argc) { + fprintf(stderr, "\nUsage: seqtk mergefa [options] \n\n"); + fprintf(stderr, "Options: -q INT quality threshold [0]\n"); + fprintf(stderr, " -i take intersection\n"); + fprintf(stderr, " -m convert to lowercase when one of the input base is N.\n"); + fprintf(stderr, " -h suppress hets in the input\n\n"); + return 1; + } + for (i = 0; i < 2; ++i) { + fp[i] = strcmp(argv[optind+i], "-")? gzopen(argv[optind+i], "r") : gzdopen(fileno(stdin), "r"); + seq[i] = kseq_init(fp[i]); + } + while (kseq_read(seq[0]) >= 0) { + int min_l, c[2], is_upper; + kseq_read(seq[1]); + if (strcmp(seq[0]->name.s, seq[1]->name.s)) + fprintf(stderr, "[%s] Different sequence names: %s != %s\n", __func__, seq[0]->name.s, seq[1]->name.s); + if (seq[0]->seq.l != seq[1]->seq.l) + fprintf(stderr, "[%s] Unequal sequence length: %ld != %ld\n", __func__, seq[0]->seq.l, seq[1]->seq.l); + min_l = seq[0]->seq.l < seq[1]->seq.l? seq[0]->seq.l : seq[1]->seq.l; + printf(">%s", seq[0]->name.s); + for (l = 0; l < min_l; ++l) { + c[0] = seq[0]->seq.s[l]; c[1] = seq[1]->seq.s[l]; + if (seq[0]->qual.l && seq[0]->qual.s[l] - 33 < qual) c[0] = tolower(c[0]); + if (seq[1]->qual.l && seq[1]->qual.s[l] - 33 < qual) c[1] = tolower(c[1]); + if (is_intersect) is_upper = (isupper(c[0]) || isupper(c[1]))? 1 : 0; + else if (is_mask) is_upper = (isupper(c[0]) || isupper(c[1]))? 1 : 0; + else is_upper = (isupper(c[0]) && isupper(c[1]))? 1 : 0; + c[0] = seq_nt16_table[c[0]]; c[1] = seq_nt16_table[c[1]]; + if (c[0] == 0) c[0] = 15; + if (c[1] == 0) c[1] = 15; + if (is_haploid && (bitcnt_table[c[0]] > 1 || bitcnt_table[c[1]] > 1)) is_upper = 0; + if (is_intersect) { + c[0] = c[0] & c[1]; + if (c[0] == 0) is_upper = 0; + } else if (is_mask) { + if (c[0] == 15 || c[1] == 15) is_upper = 0; + c[0] = c[0] & c[1]; + if (c[0] == 0) is_upper = 0; + } else c[0] = c[0] | c[1]; + c[0] = seq_nt16_rev_table[c[0]]; + if (!is_upper) c[0] = tolower(c[0]); + if (l%60 == 0) putchar('\n'); + putchar(c[0]); + } + putchar('\n'); + } + return 0; +} + +int stk_famask(int argc, char *argv[]) +{ + gzFile fp[2]; + kseq_t *seq[2]; + int i, l; + if (argc < 3) { + fprintf(stderr, "Usage: seqtk famask \n"); + return 1; + } + for (i = 0; i < 2; ++i) { + fp[i] = strcmp(argv[optind+i], "-")? gzopen(argv[optind+i], "r") : gzdopen(fileno(stdin), "r"); + seq[i] = kseq_init(fp[i]); + } + while (kseq_read(seq[0]) >= 0) { + int min_l, c[2]; + kseq_read(seq[1]); + if (strcmp(seq[0]->name.s, seq[1]->name.s)) + fprintf(stderr, "[%s] Different sequence names: %s != %s\n", __func__, seq[0]->name.s, seq[1]->name.s); + if (seq[0]->seq.l != seq[1]->seq.l) + fprintf(stderr, "[%s] Unequal sequence length: %ld != %ld\n", __func__, seq[0]->seq.l, seq[1]->seq.l); + min_l = seq[0]->seq.l < seq[1]->seq.l? seq[0]->seq.l : seq[1]->seq.l; + printf(">%s", seq[0]->name.s); + for (l = 0; l < min_l; ++l) { + c[0] = seq[0]->seq.s[l]; c[1] = seq[1]->seq.s[l]; + if (c[1] == 'x') c[0] = tolower(c[0]); + else if (c[1] != 'X') c[0] = c[1]; + if (l%60 == 0) putchar('\n'); + putchar(c[0]); + } + putchar('\n'); + } + return 0; +} + +int stk_mutfa(int argc, char *argv[]) +{ + khash_t(reg) *h = kh_init(reg); + gzFile fp; + kseq_t *seq; + kstream_t *ks; + int l, i, dret; + kstring_t *str; + khint_t k; + if (argc < 3) { + fprintf(stderr, "Usage: seqtk mutfa \n\n"); + fprintf(stderr, "Note: contains at least four columns per line which are:\n"); + fprintf(stderr, " 'chr 1-based-pos any base-changed-to'.\n"); + return 1; + } + // read the list + str = calloc(1, sizeof(kstring_t)); + fp = strcmp(argv[2], "-")? gzopen(argv[2], "r") : gzdopen(fileno(stdin), "r"); + ks = ks_init(fp); + while (ks_getuntil(ks, 0, str, &dret) >= 0) { + char *s = strdup(str->s); + int beg = 0, ret; + reglist_t *p; + k = kh_get(reg, h, s); + if (k == kh_end(h)) { + k = kh_put(reg, h, s, &ret); + memset(&kh_val(h, k), 0, sizeof(reglist_t)); + } + p = &kh_val(h, k); + if (ks_getuntil(ks, 0, str, &dret) > 0) beg = atol(str->s) - 1; // 2nd col + ks_getuntil(ks, 0, str, &dret); // 3rd col + ks_getuntil(ks, 0, str, &dret); // 4th col + // skip the rest of the line + if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); + if (isalpha(str->s[0]) && str->l == 1) { + if (p->n == p->m) { + p->m = p->m? p->m<<1 : 4; + p->a = realloc(p->a, p->m * 8); + } + p->a[p->n++] = (uint64_t)beg<<32 | str->s[0]; + } + } + ks_destroy(ks); + gzclose(fp); + free(str->s); free(str); + // mutfa + fp = strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r"); + seq = kseq_init(fp); + while ((l = kseq_read(seq)) >= 0) { + reglist_t *p; + k = kh_get(reg, h, seq->name.s); + if (k != kh_end(h)) { + p = &kh_val(h, k); + for (i = 0; i < p->n; ++i) { + int beg = p->a[i]>>32; + if (beg < seq->seq.l) + seq->seq.s[beg] = (int)p->a[i]; + } + } + printf(">%s", seq->name.s); + for (i = 0; i < l; ++i) { + if (i%60 == 0) putchar('\n'); + putchar(seq->seq.s[i]); + } + putchar('\n'); + } + // free + kseq_destroy(seq); + gzclose(fp); + for (k = 0; k < kh_end(h); ++k) { + if (kh_exist(h, k)) { + free(kh_val(h, k).a); + free((char*)kh_key(h, k)); + } + } + kh_destroy(reg, h); + return 0; +} + +int stk_listhet(int argc, char *argv[]) +{ + gzFile fp; + kseq_t *seq; + int i, l; + if (argc == 1) { + fprintf(stderr, "Usage: seqtk listhet \n"); + return 1; + } + fp = (strcmp(argv[1], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[1], "r"); + seq = kseq_init(fp); + while ((l = kseq_read(seq)) >= 0) { + for (i = 0; i < l; ++i) { + int b = seq->seq.s[i]; + if (bitcnt_table[seq_nt16_table[b]] == 2) + printf("%s\t%d\t%c\n", seq->name.s, i+1, b); + } + } + kseq_destroy(seq); + gzclose(fp); + return 0; +} + +/* cutN */ +static int cutN_min_N_tract = 1000; +static int cutN_nonN_penalty = 10; + +static int find_next_cut(const kseq_t *ks, int k, int *begin, int *end) +{ + int i, b, e; + while (k < ks->seq.l) { + if (seq_nt16_table[(int)ks->seq.s[k]] == 15) { + int score, max; + score = 0; e = max = -1; + for (i = k; i < ks->seq.l && score >= 0; ++i) { /* forward */ + if (seq_nt16_table[(int)ks->seq.s[i]] == 15) ++score; + else score -= cutN_nonN_penalty; + if (score > max) max = score, e = i; + } + score = 0; b = max = -1; + for (i = e; i >= 0 && score >= 0; --i) { /* backward */ + if (seq_nt16_table[(int)ks->seq.s[i]] == 15) ++score; + else score -= cutN_nonN_penalty; + if (score > max) max = score, b = i; + } + if (e + 1 - b >= cutN_min_N_tract) { + *begin = b; + *end = e + 1; + return *end; + } + k = e + 1; + } else ++k; + } + return -1; +} +static void print_seq(FILE *fpout, const kseq_t *ks, int begin, int end) +{ + int i; + if (begin >= end) return; // FIXME: why may this happen? Understand it! + fprintf(fpout, ">%s:%d-%d", ks->name.s, begin+1, end); + for (i = begin; i < end && i < ks->seq.l; ++i) { + if ((i - begin)%60 == 0) fputc('\n', fpout); + fputc(ks->seq.s[i], fpout); + } + fputc('\n', fpout); +} +int stk_cutN(int argc, char *argv[]) +{ + int c, l, gap_only = 0; + gzFile fp; + kseq_t *ks; + while ((c = getopt(argc, argv, "n:p:g")) >= 0) { + switch (c) { + case 'n': cutN_min_N_tract = atoi(optarg); break; + case 'p': cutN_nonN_penalty = atoi(optarg); break; + case 'g': gap_only = 1; break; + default: return 1; + } + } + if (argc == optind) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: seqtk cutN [options] \n\n"); + fprintf(stderr, "Options: -n INT min size of N tract [%d]\n", cutN_min_N_tract); + fprintf(stderr, " -p INT penalty for a non-N [%d]\n", cutN_nonN_penalty); + fprintf(stderr, " -g print gaps only, no sequence\n\n"); + return 1; + } + fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r"); + ks = kseq_init(fp); + while ((l = kseq_read(ks)) >= 0) { + int k = 0, begin = 0, end = 0; + while (find_next_cut(ks, k, &begin, &end) >= 0) { + if (begin != 0) { + if (gap_only) printf("%s\t%d\t%d\n", ks->name.s, begin, end); + else print_seq(stdout, ks, k, begin); + } + k = end; + } + if (!gap_only) print_seq(stdout, ks, k, l); + } + kseq_destroy(ks); + gzclose(fp); + return 0; +} + +/* main function */ +static int usage() +{ + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: seqtk \n\n"); + fprintf(stderr, "Command: comp get the nucleotide composite of FASTA/Q\n"); + fprintf(stderr, " hety regional heterozygosity\n"); + fprintf(stderr, " fq2fa convert FASTQ to FASTA\n"); + fprintf(stderr, " subseq extract subsequences from FASTA/Q\n"); + fprintf(stderr, " maskseq mask sequences\n"); + fprintf(stderr, " mutfa point mutate FASTA at specified positions\n"); + fprintf(stderr, " mergefa merge two FASTA/Q files\n"); + fprintf(stderr, " randbase choose a random base from hets\n"); + fprintf(stderr, " cutN cut sequence at long N\n"); + fprintf(stderr, " listhet extract the position of each het\n"); + fprintf(stderr, "\n"); + return 1; +} + +int main(int argc, char *argv[]) +{ + if (argc == 1) return usage(); + if (strcmp(argv[1], "comp") == 0) stk_comp(argc-1, argv+1); + else if (strcmp(argv[1], "hety") == 0) stk_hety(argc-1, argv+1); + else if (strcmp(argv[1], "fq2fa") == 0) stk_fq2fa(argc-1, argv+1); + else if (strcmp(argv[1], "subseq") == 0) stk_subseq(argc-1, argv+1); + else if (strcmp(argv[1], "maskseq") == 0) stk_maskseq(argc-1, argv+1); + else if (strcmp(argv[1], "mutfa") == 0) stk_mutfa(argc-1, argv+1); + else if (strcmp(argv[1], "mergefa") == 0) stk_mergefa(argc-1, argv+1); + else if (strcmp(argv[1], "randbase") == 0) stk_randbase(argc-1, argv+1); + else if (strcmp(argv[1], "cutN") == 0) stk_cutN(argc-1, argv+1); + else if (strcmp(argv[1], "listhet") == 0) stk_listhet(argc-1, argv+1); + else if (strcmp(argv[1], "famask") == 0) stk_famask(argc-1, argv+1); + else { + fprintf(stderr, "[main] unrecognized commad '%s'. Abort!\n", argv[1]); + return 1; + } + return 0; +} diff --git a/misc/wgsim.c b/misc/wgsim.c index 7b5f095..b9c513c 100644 --- a/misc/wgsim.c +++ b/misc/wgsim.c @@ -1,6 +1,7 @@ /* The MIT License Copyright (c) 2008 Genome Research Ltd (GRL). + 2011 Heng Li Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -23,11 +24,8 @@ SOFTWARE. */ -/* Contact: Heng Li */ - /* This program is separated from maq's read simulator with Colin - * Hercus' modification to allow longer indels. Colin is the chief - * developer of novoalign. */ + * Hercus' modification to allow longer indels. */ #include #include @@ -38,8 +36,11 @@ #include #include #include +#include +#include "kseq.h" +KSEQ_INIT(gzFile, gzread) -#define PACKAGE_VERSION "0.2.3" +#define PACKAGE_VERSION "0.3.0" const uint8_t nst_nt4_table[256] = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, @@ -60,8 +61,6 @@ const uint8_t nst_nt4_table[256] = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 }; -const int nst_color_space_table[] = { 4, 0, 0, 1, 0, 2, 3, 4, 0, 3, 2, 4, 1, 4, 4, 4}; - /* Simple normal random number generator, copied from genran.c */ double ran_normal() @@ -85,78 +84,6 @@ double ran_normal() } } -/* FASTA parser, copied from seq.c */ - -typedef struct { - int l, m; /* length and maximum buffer size */ - unsigned char *s; /* sequence */ -} seq_t; - -#define INIT_SEQ(seq) (seq).s = 0; (seq).l = (seq).m = 0 - -static int SEQ_BLOCK_SIZE = 512; - -void seq_set_block_size(int size) -{ - SEQ_BLOCK_SIZE = size; -} - -int seq_read_fasta(FILE *fp, seq_t *seq, char *locus, char *comment) -{ - int c, l, max; - char *p; - - c = 0; - while (!feof(fp) && fgetc(fp) != '>'); - if (feof(fp)) return -1; - p = locus; - while (!feof(fp) && (c = fgetc(fp)) != ' ' && c != '\t' && c != '\n') - if (c != '\r') *p++ = c; - *p = '\0'; - if (comment) { - p = comment; - if (c != '\n') { - while (!feof(fp) && ((c = fgetc(fp)) == ' ' || c == '\t')); - if (c != '\n') { - *p++ = c; - while (!feof(fp) && (c = fgetc(fp)) != '\n') - if (c != '\r') *p++ = c; - } - } - *p = '\0'; - } else if (c != '\n') while (!feof(fp) && fgetc(fp) != '\n'); - l = 0; max = seq->m; - while (!feof(fp) && (c = fgetc(fp)) != '>') { - if (isalpha(c) || c == '-' || c == '.') { - if (l + 1 >= max) { - max += SEQ_BLOCK_SIZE; - seq->s = (unsigned char*)realloc(seq->s, sizeof(char) * max); - } - seq->s[l++] = (unsigned char)c; - } - } - if (c == '>') ungetc(c,fp); - seq->s[l] = 0; - seq->m = max; seq->l = l; - return l; -} - -/* Error-checking open, copied from utils.c */ - -#define xopen(fn, mode) err_xopen_core(__func__, fn, mode) - -FILE *err_xopen_core(const char *func, const char *fn, const char *mode) -{ - FILE *fp = 0; - if (strcmp(fn, "-") == 0) - return (strstr(mode, "r"))? stdin : stdout; - if ((fp = fopen(fn, mode)) == 0) { - fprintf(stderr, "[%s] fail to open file '%s'. Abort!\n", func, fn); - abort(); - } - return fp; -} - /* wgsim */ enum muttype_t {NOCHANGE = 0, INSERT = 0x1000, SUBSTITUTE = 0xe000, DELETE = 0xf000}; @@ -170,24 +97,23 @@ typedef struct { static double ERR_RATE = 0.02; static double MUT_RATE = 0.001; -static double INDEL_FRAC = 0.1; +static double INDEL_FRAC = 0.15; static double INDEL_EXTEND = 0.3; -static int IS_SOLID = 0; -static int SHOW_MM_INFO = 1; +static double MAX_N_RATIO = 0.1; -void maq_mut_diref(const seq_t *seq, int is_hap, mutseq_t *hap1, mutseq_t *hap2) +void wgsim_mut_diref(const kseq_t *ks, int is_hap, mutseq_t *hap1, mutseq_t *hap2) { int i, deleting = 0; mutseq_t *ret[2]; ret[0] = hap1; ret[1] = hap2; - ret[0]->l = seq->l; ret[1]->l = seq->l; - ret[0]->m = seq->m; ret[1]->m = seq->m; - ret[0]->s = (mut_t *)calloc(seq->m, sizeof(mut_t)); - ret[1]->s = (mut_t *)calloc(seq->m, sizeof(mut_t)); - for (i = 0; i != seq->l; ++i) { + ret[0]->l = ks->seq.l; ret[1]->l = ks->seq.l; + ret[0]->m = ks->seq.m; ret[1]->m = ks->seq.m; + ret[0]->s = (mut_t *)calloc(ks->seq.m, sizeof(mut_t)); + ret[1]->s = (mut_t *)calloc(ks->seq.m, sizeof(mut_t)); + for (i = 0; i != ks->seq.l; ++i) { int c; - c = ret[0]->s[i] = ret[1]->s[i] = (mut_t)nst_nt4_table[(int)seq->s[i]]; + c = ret[0]->s[i] = ret[1]->s[i] = (mut_t)nst_nt4_table[(int)ks->seq.s[i]]; if (deleting) { if (drand48() < INDEL_EXTEND) { if (deleting & 1) ret[0]->s[i] |= DELETE; @@ -230,12 +156,12 @@ void maq_mut_diref(const seq_t *seq, int is_hap, mutseq_t *hap1, mutseq_t *hap2) } } } -void maq_print_mutref(const char *name, const seq_t *seq, mutseq_t *hap1, mutseq_t *hap2) +void wgsim_print_mutref(const char *name, const kseq_t *ks, mutseq_t *hap1, mutseq_t *hap2) { int i; - for (i = 0; i != seq->l; ++i) { + for (i = 0; i != ks->seq.l; ++i) { int c[3]; - c[0] = nst_nt4_table[(int)seq->s[i]]; + c[0] = nst_nt4_table[(int)ks->seq.s[i]]; c[1] = hap1->s[i]; c[2] = hap2->s[i]; if (c[0] >= 4) continue; if ((c[1] & mutmsk) != NOCHANGE || (c[2] & mutmsk) != NOCHANGE) { @@ -248,8 +174,9 @@ void maq_print_mutref(const char *name, const seq_t *seq, mutseq_t *hap1, mutseq } else if (((c[1] & mutmsk) >> 12) <= 5) { // ins printf("-\t"); int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4; - while(n > 0) { + while (n > 0) { putchar("ACGTN"[ins & 0x3]); + ins >>= 2; n--; } printf("\t-\n"); @@ -266,6 +193,7 @@ void maq_print_mutref(const char *name, const seq_t *seq, mutseq_t *hap1, mutseq int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4; while (n > 0) { putchar("ACGTN"[ins & 0x3]); + ins >>= 2; n--; } printf("\t+\n"); @@ -284,46 +212,51 @@ void maq_print_mutref(const char *name, const seq_t *seq, mutseq_t *hap1, mutseq } } -void wgsim_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, int is_hap, uint64_t N, int dist, int std_dev, int size_l, int size_r) +void wgsim_core(FILE *fpout1, FILE *fpout2, const char *fn, int is_hap, uint64_t N, int dist, int std_dev, int size_l, int size_r) { - seq_t seq; + kseq_t *ks; mutseq_t rseq[2]; + gzFile fp_fa; uint64_t tot_len, ii; int i, l, n_ref; - char name[256], *qstr; - int size[2], Q; + char *qstr; + int size[2], Q, max_size; uint8_t *tmp_seq[2]; mut_t *target; - INIT_SEQ(seq); - srand48(time(0)); - seq_set_block_size(0x1000000); l = size_l > size_r? size_l : size_r; qstr = (char*)calloc(l+1, 1); tmp_seq[0] = (uint8_t*)calloc(l+2, 1); tmp_seq[1] = (uint8_t*)calloc(l+2, 1); size[0] = size_l; size[1] = size_r; + max_size = size_l > size_r? size_l : size_r; Q = (ERR_RATE == 0.0)? 'I' : (int)(-10.0 * log(ERR_RATE) / log(10.0) + 0.499) + 33; + fp_fa = gzopen(fn, "r"); + ks = kseq_init(fp_fa); tot_len = n_ref = 0; - while ((l = seq_read_fasta(fp_fa, &seq, name, 0)) >= 0) { + fprintf(stderr, "[%s] calculating the total length of the reference sequence...\n", __func__); + while ((l = kseq_read(ks)) >= 0) { tot_len += l; ++n_ref; } - fprintf(stderr, "[wgsim_core] %d sequences, total length: %llu\n", n_ref, (long long)tot_len); - rewind(fp_fa); + fprintf(stderr, "[%s] %d sequences, total length: %llu\n", __func__, n_ref, (long long)tot_len); + kseq_destroy(ks); + gzclose(fp_fa); - while ((l = seq_read_fasta(fp_fa, &seq, name, 0)) >= 0) { + fp_fa = gzopen(fn, "r"); + ks = kseq_init(fp_fa); + while ((l = kseq_read(ks)) >= 0) { uint64_t n_pairs = (uint64_t)((long double)l / tot_len * N + 0.5); if (l < dist + 3 * std_dev) { - fprintf(stderr, "[wgsim_core] kkip sequence '%s' as it is shorter than %d!\n", name, dist + 3 * std_dev); + fprintf(stderr, "[%s] skip sequence '%s' as it is shorter than %d!\n", __func__, ks->name.s, dist + 3 * std_dev); continue; } // generate mutations and print them out - maq_mut_diref(&seq, is_hap, rseq, rseq+1); - maq_print_mutref(name, &seq, rseq, rseq+1); + wgsim_mut_diref(ks, is_hap, rseq, rseq+1); + wgsim_print_mutref(ks->name.s, ks, rseq, rseq+1); for (ii = 0; ii != n_pairs; ++ii) { // the core loop double ran; @@ -335,8 +268,9 @@ void wgsim_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, int is_hap, uint64_t N, ran = ran_normal(); ran = ran * std_dev + dist; d = (int)(ran + 0.5); + d = d > max_size? d : max_size; pos = (int)((l - d + 1) * drand48()); - } while (pos < 0 || pos >= seq.l || pos + d - 1 >= seq.l); + } while (pos < 0 || pos >= ks->seq.l || pos + d - 1 >= ks->seq.l); // flip or not if (drand48() < 0.5) { @@ -353,7 +287,7 @@ void wgsim_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, int is_hap, uint64_t N, n_sub[0] = n_sub[1] = n_indel[0] = n_indel[1] = n_err[0] = n_err[1] = 0; #define __gen_read(x, start, iter) do { \ - for (i = (start), k = 0, ext_coor[x] = -10; i >= 0 && i < seq.l && k < s[x]; iter) { \ + for (i = (start), k = 0, ext_coor[x] = -10; i >= 0 && i < ks->seq.l && k < s[x]; iter) { \ int c = target[i], mut_type = c & mutmsk; \ if (ext_coor[x] < 0) { \ if (mut_type != NOCHANGE && mut_type != SUBSTITUTE) continue; \ @@ -374,33 +308,9 @@ void wgsim_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, int is_hap, uint64_t N, if (k != s[x]) ext_coor[x] = -10; \ } while (0) - if (!IS_SOLID) { - __gen_read(0, pos, ++i); - __gen_read(1, pos + d - 1, --i); - for (k = 0; k < s[1]; ++k) tmp_seq[1][k] = tmp_seq[1][k] < 4? 3 - tmp_seq[1][k] : 4; // complement - } else { - int c1, c2, c; - ++s[0]; ++s[1]; // temporarily increase read length by 1 - if (is_flip) { // RR pair - __gen_read(0, pos + s[0], --i); - __gen_read(1, pos + d - 1, --i); - } else { // FF pair - __gen_read(0, pos, ++i); - __gen_read(1, pos + d - 1 - s[1], ++i); - ++ext_coor[0]; ++ext_coor[1]; - } - // change to color sequence: (0,1,2,3) -> (A,C,G,T) - for (j = 0; j < 2; ++j) { - c1 = tmp_seq[j][0]; - for (i = 1; i < s[j]; ++i) { - c2 = tmp_seq[j][i]; - c = (c1 >= 4 || c2 >= 4)? 4 : nst_color_space_table[(1<= 4) c = 4; // actually c should be never larger than 4 if everything is correct - else if (drand48() < ERR_RATE) { - c = (c + (int)(drand48() * 3.0 + 1)) & 3; + if (c >= 4) { // actually c should be never larger than 4 if everything is correct + c = 4; + ++n_n; + } else if (drand48() < ERR_RATE) { + // c = (c + (int)(drand48() * 3.0 + 1)) & 3; // random sequencing errors + c = (c + 1) & 3; // recurrent sequencing errors ++n_err[j]; } tmp_seq[j][i] = c; } + if ((double)n_n / s[j] > MAX_N_RATIO) break; + } + if (j < 2) { // too many ambiguous bases on one of the reads + --ii; + continue; } // print for (j = 0; j < 2; ++j) { for (i = 0; i < s[j]; ++i) qstr[i] = Q; qstr[i] = 0; - if (SHOW_MM_INFO) { - fprintf(fpo[j], "@%s_%u_%u_%d:%d:%d_%d:%d:%d_%llx/%d\n", name, ext_coor[0]+1, ext_coor[1]+1, - n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1], - (long long)ii, j==0? is_flip+1 : 2-is_flip); - } else { - fprintf(fpo[j], "@%s_%u_%u_%llx/%d %d:%d:%d_%d:%d:%d\n", name, ext_coor[0]+1, ext_coor[1]+1, - (long long)ii, j==0? is_flip+1 : 2-is_flip, - n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1]); - } + fprintf(fpo[j], "@%s_%u_%u_%d:%d:%d_%d:%d:%d_%llx/%d\n", ks->name.s, ext_coor[0]+1, ext_coor[1]+1, + n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1], + (long long)ii, j==0? is_flip+1 : 2-is_flip); for (i = 0; i < s[j]; ++i) fputc("ACGTN"[(int)tmp_seq[j][i]], fpo[j]); fprintf(fpo[j], "\n+\n%s\n", qstr); @@ -439,7 +352,9 @@ void wgsim_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, int is_hap, uint64_t N, } free(rseq[0].s); free(rseq[1].s); } - free(seq.s); free(qstr); + kseq_destroy(ks); + gzclose(fp_fa); + free(qstr); free(tmp_seq[0]); free(tmp_seq[1]); } @@ -459,11 +374,9 @@ static int simu_usage() fprintf(stderr, " -r FLOAT rate of mutations [%.4f]\n", MUT_RATE); fprintf(stderr, " -R FLOAT fraction of indels [%.2f]\n", INDEL_FRAC); fprintf(stderr, " -X FLOAT probability an indel is extended [%.2f]\n", INDEL_EXTEND); - fprintf(stderr, " -c generate reads in color space (SOLiD reads)\n"); - fprintf(stderr, " -C show mismatch info in comment rather than read name\n"); + fprintf(stderr, " -S INT seed for random generator [-1]\n"); fprintf(stderr, " -h haplotype mode\n"); fprintf(stderr, "\n"); - fprintf(stderr, "Note: For SOLiD reads, the first read is F3 and the second is R3.\n\n"); return 1; } @@ -471,11 +384,12 @@ int main(int argc, char *argv[]) { int64_t N; int dist, std_dev, c, size_l, size_r, is_hap = 0; - FILE *fpout1, *fpout2, *fp_fa; + FILE *fpout1, *fpout2; + int seed = -1; N = 1000000; dist = 500; std_dev = 50; size_l = size_r = 70; - while ((c = getopt(argc, argv, "e:d:s:N:1:2:r:R:hX:cC")) >= 0) { + while ((c = getopt(argc, argv, "e:d:s:N:1:2:r:R:hX:S:")) >= 0) { switch (c) { case 'd': dist = atoi(optarg); break; case 's': std_dev = atoi(optarg); break; @@ -486,17 +400,20 @@ int main(int argc, char *argv[]) case 'r': MUT_RATE = atof(optarg); break; case 'R': INDEL_FRAC = atof(optarg); break; case 'X': INDEL_EXTEND = atof(optarg); break; - case 'c': IS_SOLID = 1; break; - case 'C': SHOW_MM_INFO = 0; break; + case 'S': seed = atoi(optarg); break; case 'h': is_hap = 1; break; } } if (argc - optind < 3) return simu_usage(); - fp_fa = (strcmp(argv[optind+0], "-") == 0)? stdin : xopen(argv[optind+0], "r"); - fpout1 = xopen(argv[optind+1], "w"); - fpout2 = xopen(argv[optind+2], "w"); - wgsim_core(fpout1, fpout2, fp_fa, is_hap, N, dist, std_dev, size_l, size_r); + fpout1 = fopen(argv[optind+1], "w"); + fpout2 = fopen(argv[optind+2], "w"); + if (!fpout1 || !fpout2) { + fprintf(stderr, "[wgsim] file open error\n"); + return 1; + } + srand48(seed > 0? seed : time(0)); + wgsim_core(fpout1, fpout2, argv[optind], is_hap, N, dist, std_dev, size_l, size_r); - fclose(fpout1); fclose(fpout2); fclose(fp_fa); + fclose(fpout1); fclose(fpout2); return 0; } diff --git a/sam_view.c b/sam_view.c index 0821a61..efda4e8 100644 --- a/sam_view.c +++ b/sam_view.c @@ -19,8 +19,10 @@ typedef struct { typedef khash_t(rg) *rghash_t; +// FIXME: we'd better use no global variables... static rghash_t g_rghash = 0; static int g_min_mapQ = 0, g_flag_on = 0, g_flag_off = 0; +static float g_subsam = -1; static char *g_library, *g_rg; static void *g_bed; @@ -34,6 +36,11 @@ static inline int __g_skip_aln(const bam_header_t *h, const bam1_t *b) return 1; if (g_bed && b->core.tid >= 0 && !bed_overlap(g_bed, h->target_name[b->core.tid], b->core.pos, bam_calend(&b->core, bam1_cigar(b)))) return 1; + if (g_subsam > 0.) { + int x = (int)(g_subsam + .499); + uint32_t k = __ac_X31_hash_string(bam1_qname(b)) + x; + if (k%1024 / 1024.0 >= g_subsam - x) return 1; + } if (g_rg || g_rghash) { uint8_t *s = bam_aux_get(b, "RG"); if (s) { @@ -111,8 +118,9 @@ int main_samview(int argc, char *argv[]) /* parse command-line options */ strcpy(in_mode, "r"); strcpy(out_mode, "w"); - while ((c = getopt(argc, argv, "Sbct:h1Ho:q:f:F:ul:r:xX?T:R:L:")) >= 0) { + while ((c = getopt(argc, argv, "Sbct:h1Ho:q:f:F:ul:r:xX?T:R:L:s:")) >= 0) { switch (c) { + case 's': g_subsam = atof(optarg); break; case 'c': is_count = 1; break; case 'S': is_bamin = 0; break; case 'b': is_bamout = 1; break; @@ -279,6 +287,7 @@ static int usage(int is_long_help) fprintf(stderr, " -q INT minimum mapping quality [0]\n"); fprintf(stderr, " -l STR only output reads in library STR [null]\n"); fprintf(stderr, " -r STR only output reads in read group STR [null]\n"); + fprintf(stderr, " -s FLOAT fraction of templates to subsample; integer part as seed [-1]\n"); fprintf(stderr, " -? longer help\n"); fprintf(stderr, "\n"); if (is_long_help) diff --git a/sample.c b/sample.c index 4e4b8a4..830b9d1 100644 --- a/sample.c +++ b/sample.c @@ -52,7 +52,7 @@ static void add_pair(bam_sample_t *sm, khash_t(sm) *sm2id, const char *key, cons int bam_smpl_add(bam_sample_t *sm, const char *fn, const char *txt) { const char *p = txt, *q, *r; - kstring_t buf; + kstring_t buf, first_sm; int n = 0; khash_t(sm) *sm2id = (khash_t(sm)*)sm->sm2id; if (txt == 0) { @@ -60,6 +60,7 @@ int bam_smpl_add(bam_sample_t *sm, const char *fn, const char *txt) return 0; } memset(&buf, 0, sizeof(kstring_t)); + memset(&first_sm, 0, sizeof(kstring_t)); while ((q = strstr(p, "@RG")) != 0) { p = q + 3; r = q = 0; @@ -73,12 +74,21 @@ int bam_smpl_add(bam_sample_t *sm, const char *fn, const char *txt) oq = *u; or = *v; *u = *v = '\0'; buf.l = 0; kputs(fn, &buf); kputc('/', &buf); kputs(q, &buf); add_pair(sm, sm2id, buf.s, r); + if ( !first_sm.s ) + kputs(r,&first_sm); *u = oq; *v = or; } else break; p = q > r? q : r; ++n; } if (n == 0) add_pair(sm, sm2id, fn, fn); + // If there is only one RG tag present in the header and reads are not annotated, don't refuse to work but + // use the tag instead. + else if ( n==1 && first_sm.s ) + add_pair(sm,sm2id,fn,first_sm.s); + if ( first_sm.s ) + free(first_sm.s); + // add_pair(sm, sm2id, fn, fn); free(buf.s); return 0; -- 2.30.2