From: Charles Plessy Date: Sat, 7 May 2011 08:30:17 +0000 (+0900) Subject: Merge commit 'upstream/0.1.16' X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=samtools.git;a=commitdiff_plain;h=b7c06ea4740153b8f27c7c2374131dbd607b6113;hp=cdbe062086fb28ae4cc8dd0bfb224592bfb40d7d Merge commit 'upstream/0.1.16' --- diff --git a/NEWS b/NEWS index de55679..a600bb1 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,40 @@ +Beta Release 0.1.16 (21 April, 2011) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes in samtools: + + * Support the new SAM/BAM type `B' in the latest SAM spec v1.4. + + * When the output file of `samtools merge' exists, do not overwrite it unless + a new command-line option `-f' is applied. + + * Bugfix: BED support is not working when the input BED is not sorted. + + * Bugfix: some reads without coordinates but given on the reverse strand are + lost in merging. + +Notable changes in bcftools: + + * Code cleanup: separated max-likelihood inference and Bayesian inference. + + * Test Hardy-Weinberg equilibrium with a likelihood-ratio test. + + * Provided another association test P-value by likelihood-ratio test. + + * Use Brent's method to estimate the site allele frequency when EM converges + slowly. The resulting ML estimate of allele frequnecy is more accurate. + + * Added the `ldpair' command, which computes r^2 between SNP pairs given in + an input file. + +Also, the `pileup' command, which has been deprecated by `mpileup' since +version 0.1.10, will be dropped in the next release. The old `pileup' command +is substandard and causing a lot of confusion. + +(0.1.16: 21 April 2011, r963:234) + + + Beta Release 0.1.15 (10 April, 2011) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/bam.c b/bam.c index a65aed5..5fac17d 100644 --- a/bam.c +++ b/bam.c @@ -160,6 +160,19 @@ static void swap_endian_data(const bam1_core_t *c, int data_len, uint8_t *data) else if (type == 'I' || type == 'F') { bam_swap_endian_4p(s); s += 4; } else if (type == 'D') { bam_swap_endian_8p(s); s += 8; } else if (type == 'Z' || type == 'H') { while (*s) ++s; ++s; } + else if (type == 'B') { + int32_t n, Bsize = bam_aux_type2size(*s); + memcpy(&n, s + 1, 4); + if (1 == Bsize) { + } else if (2 == Bsize) { + for (i = 0; i < n; i += 2) + bam_swap_endian_2p(s + 5 + i); + } else if (4 == Bsize) { + for (i = 0; i < n; i += 4) + bam_swap_endian_4p(s + 5 + i); + } + bam_swap_endian_4p(s+1); + } } } @@ -289,6 +302,23 @@ char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of) else if (type == 'f') { ksprintf(&str, "f:%g", *(float*)s); s += 4; } else if (type == 'd') { ksprintf(&str, "d:%lg", *(double*)s); s += 8; } else if (type == 'Z' || type == 'H') { kputc(type, &str); kputc(':', &str); while (*s) kputc(*s++, &str); ++s; } + else if (type == 'B') { + uint8_t sub_type = *(s++); + int32_t n; + memcpy(&n, s, 4); + s += 4; // no point to the start of the array + kputc(type, &str); kputc(':', &str); kputc(sub_type, &str); // write the typing + for (i = 0; i < n; ++i) { + kputc(',', &str); + if ('c' == sub_type || 'c' == sub_type) { kputw(*(int8_t*)s, &str); ++s; } + else if ('C' == sub_type) { kputw(*(uint8_t*)s, &str); ++s; } + else if ('s' == sub_type) { kputw(*(int16_t*)s, &str); s += 2; } + else if ('S' == sub_type) { kputw(*(uint16_t*)s, &str); s += 2; } + else if ('i' == sub_type) { kputw(*(int32_t*)s, &str); s += 4; } + else if ('I' == sub_type) { kputuw(*(uint32_t*)s, &str); s += 4; } + else if ('f' == sub_type) { ksprintf(&str, "%g", *(float*)s); s += 4; } + } + } } return str.s; } diff --git a/bam.h b/bam.h index 87e3de3..e7360bb 100644 --- a/bam.h +++ b/bam.h @@ -40,7 +40,7 @@ @copyright Genome Research Ltd. */ -#define BAM_VERSION "0.1.15 (r949:203)" +#define BAM_VERSION "0.1.16 (r963:234)" #include #include @@ -746,4 +746,13 @@ static inline bam1_t *bam_dup1(const bam1_t *src) return b; } +static inline int bam_aux_type2size(int x) +{ + if (x == 'C' || x == 'c' || x == 'A') return 1; + else if (x == 'S' || x == 's') return 2; + else if (x == 'I' || x == 'i' || x == 'f') return 4; + else return 0; +} + + #endif diff --git a/bam_aux.c b/bam_aux.c index 4fd3190..b6a8d88 100644 --- a/bam_aux.c +++ b/bam_aux.c @@ -26,14 +26,12 @@ uint8_t *bam_aux_get_core(bam1_t *b, const char tag[2]) } #define __skip_tag(s) do { \ - int type = toupper(*(s)); \ - ++(s); \ - if (type == 'C' || type == 'A') ++(s); \ - else if (type == 'S') (s) += 2; \ - else if (type == 'I' || type == 'F') (s) += 4; \ - else if (type == 'D') (s) += 8; \ - else if (type == 'Z' || type == 'H') { while (*(s)) ++(s); ++(s); } \ - } while (0) + int type = toupper(*(s)); \ + ++(s); \ + if (type == 'Z' || type == 'H') { while (*(s)) ++(s); ++(s); } \ + else if (type == 'B') (s) += 5 + bam_aux_type2size(*(s)) * (*(int32_t*)((s)+1)); \ + else (s) += bam_aux_type2size(type); \ + } while(0) uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]) { diff --git a/bam_import.c b/bam_import.c index 9d84328..8c24692 100644 --- a/bam_import.c +++ b/bam_import.c @@ -427,6 +427,27 @@ int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b) memcpy(s, str->s + 5, str->l - 5); s[str->l - 5] = 0; doff += size; + } else if (type == 'B') { + int32_t n = 0, Bsize, k = 0, size; + char *p; + if (str->l < 8) parse_error(fp->n_lines, "too few values in aux type B"); + Bsize = bam_aux_type2size(str->s[5]); // the size of each element + for (p = (char*)str->s + 6; *p; ++p) // count the number of elements in the array + if (*p == ',') ++n; + p = str->s + 7; // now p points to the first number in the array + size = 6 + Bsize * n; // total number of bytes allocated to this tag + s = alloc_data(b, doff + 6 * Bsize * n) + doff; // allocate memory + *s++ = 'B'; *s++ = str->s[5]; + memcpy(s, &n, 4); s += 4; // write the number of elements + if (str->s[5] == 'c') while (p < str->s + str->l) ((int8_t*)s)[k++] = (int8_t)strtol(p, &p, 0), ++p; + else if (str->s[5] == 'C') while (p < str->s + str->l) ((uint8_t*)s)[k++] = (uint8_t)strtol(p, &p, 0), ++p; + else if (str->s[5] == 's') while (p < str->s + str->l) ((int16_t*)s)[k++] = (int16_t)strtol(p, &p, 0), ++p; // FIXME: avoid unaligned memory + else if (str->s[5] == 'S') while (p < str->s + str->l) ((uint16_t*)s)[k++] = (uint16_t)strtol(p, &p, 0), ++p; + else if (str->s[5] == 'i') while (p < str->s + str->l) ((int32_t*)s)[k++] = (int32_t)strtol(p, &p, 0), ++p; + else if (str->s[5] == 'I') while (p < str->s + str->l) ((uint32_t*)s)[k++] = (uint32_t)strtol(p, &p, 0), ++p; + else if (str->s[5] == 'f') while (p < str->s + str->l) ((float*)s)[k++] = (float)strtof(p, &p), ++p; + else parse_error(fp->n_lines, "unrecognized array type"); + s += Bsize * n; doff += size; } else parse_error(fp->n_lines, "unrecognized type"); if (dret == '\n' || dret == '\r') break; } diff --git a/bam_sort.c b/bam_sort.c index 94970b5..abf8d4f 100644 --- a/bam_sort.c +++ b/bam_sort.c @@ -71,6 +71,7 @@ static void swap_header_text(bam_header_t *h1, bam_header_t *h2) #define MERGE_RG 1 #define MERGE_UNCOMP 2 #define MERGE_LEVEL1 4 +#define MERGE_FORCE 8 /*! @abstract Merge multiple sorted BAM. @@ -203,7 +204,7 @@ int bam_merge_core(int by_qname, const char *out, const char *headers, int n, ch h->i = i; h->b = (bam1_t*)calloc(1, sizeof(bam1_t)); if (bam_iter_read(fp[i], iter[i], h->b) >= 0) { - h->pos = ((uint64_t)h->b->core.tid<<32) | (uint32_t)h->b->core.pos<<1 | bam1_strand(h->b); + h->pos = ((uint64_t)h->b->core.tid<<32) | (uint32_t)((int32_t)h->b->core.pos+1)<<1 | bam1_strand(h->b); h->idx = idx++; } else h->pos = HEAP_EMPTY; @@ -228,7 +229,7 @@ int bam_merge_core(int by_qname, const char *out, const char *headers, int n, ch } bam_write1_core(fpout, &b->core, b->data_len, b->data); if ((j = bam_iter_read(fp[heap->i], iter[heap->i], b)) >= 0) { - heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)b->core.pos<<1 | bam1_strand(b); + heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)((int)b->core.pos+1)<<1 | bam1_strand(b); heap->idx = idx++; } else if (j == -1) { heap->pos = HEAP_EMPTY; @@ -256,9 +257,10 @@ int bam_merge(int argc, char *argv[]) int c, is_by_qname = 0, flag = 0, ret = 0; char *fn_headers = NULL, *reg = 0; - while ((c = getopt(argc, argv, "h:nru1R:")) >= 0) { + while ((c = getopt(argc, argv, "h:nru1R:f")) >= 0) { switch (c) { case 'r': flag |= MERGE_RG; break; + case 'f': flag |= MERGE_FORCE; break; case 'h': fn_headers = strdup(optarg); break; case 'n': is_by_qname = 1; break; case '1': flag |= MERGE_LEVEL1; break; @@ -272,6 +274,7 @@ int bam_merge(int argc, char *argv[]) fprintf(stderr, "Options: -n sort by read names\n"); fprintf(stderr, " -r attach RG tag (inferred from file names)\n"); fprintf(stderr, " -u uncompressed BAM output\n"); + fprintf(stderr, " -f overwrite the output BAM if exist\n"); fprintf(stderr, " -1 compress level 1\n"); fprintf(stderr, " -R STR merge file in the specified region STR [all]\n"); fprintf(stderr, " -h FILE copy the header in FILE to [in1.bam]\n\n"); @@ -280,6 +283,14 @@ int bam_merge(int argc, char *argv[]) fprintf(stderr, " the header dictionary in merging.\n\n"); return 1; } + if (!(flag & MERGE_FORCE) && strcmp(argv[optind], "-")) { + FILE *fp = fopen(argv[optind], "rb"); + if (fp != NULL) { + fclose(fp); + fprintf(stderr, "[%s] File '%s' exists. Please apply '-f' to overwrite. Abort.\n", __func__, argv[optind]); + return 1; + } + } if (bam_merge_core(is_by_qname, argv[optind], fn_headers, argc - optind - 1, argv + optind + 1, flag, reg) < 0) ret = 1; free(reg); free(fn_headers); @@ -292,8 +303,8 @@ static inline int bam1_lt(const bam1_p a, const bam1_p b) { if (g_is_by_qname) { int t = strnum_cmp(bam1_qname(a), bam1_qname(b)); - return (t < 0 || (t == 0 && (((uint64_t)a->core.tid<<32|a->core.pos) < ((uint64_t)b->core.tid<<32|b->core.pos)))); - } else return (((uint64_t)a->core.tid<<32|a->core.pos) < ((uint64_t)b->core.tid<<32|b->core.pos)); + return (t < 0 || (t == 0 && (((uint64_t)a->core.tid<<32|(a->core.pos+1)) < ((uint64_t)b->core.tid<<32|(b->core.pos+1))))); + } else return (((uint64_t)a->core.tid<<32|(a->core.pos+1)) < ((uint64_t)b->core.tid<<32|(b->core.pos+1))); } KSORT_INIT(sort, bam1_p, bam1_lt) diff --git a/bamtk.c b/bamtk.c index 5886570..0941f6f 100644 --- a/bamtk.c +++ b/bamtk.c @@ -31,46 +31,6 @@ int main_depth(int argc, char *argv[]); int faidx_main(int argc, char *argv[]); int glf3_view_main(int argc, char *argv[]); -int bam_tagview(int argc, char *argv[]) -{ - bamFile fp; - bam_header_t *header; - bam1_t *b; - char tag[2]; - int ret; - if (argc < 3) { - fprintf(stderr, "Usage: samtools tagview \n"); - return 1; - } - fp = strcmp(argv[1], "-")? bam_open(argv[1], "r") : bam_dopen(fileno(stdin), "r"); - assert(fp); - header = bam_header_read(fp); - if (header == 0) { - fprintf(stderr, "[bam_view] fail to read the BAM header. Abort!\n"); - return 1; - } - tag[0] = argv[2][0]; tag[1] = argv[2][1]; - b = (bam1_t*)calloc(1, sizeof(bam1_t)); - while ((ret = bam_read1(fp, b)) >= 0) { - uint8_t *d = bam_aux_get(b, tag); - if (d) { - printf("%s\t%d\t", bam1_qname(b), b->core.flag); - if (d[0] == 'Z' || d[0] == 'H') printf("%s\n", bam_aux2Z(d)); - else if (d[0] == 'f') printf("%f\n", bam_aux2f(d)); - else if (d[0] == 'd') printf("%lf\n", bam_aux2d(d)); - else if (d[0] == 'A') printf("%c\n", bam_aux2A(d)); - else if (d[0] == 'c' || d[0] == 's' || d[0] == 'i') printf("%d\n", bam_aux2i(d)); - else if (d[0] == 'C' || d[0] == 'S' || d[0] == 'I') printf("%u\n", bam_aux2i(d)); - else printf("\n"); - } - } - if (ret < -1) fprintf(stderr, "[bam_view] truncated file? Continue anyway. (%d)\n", ret); - free(b->data); free(b); - bam_header_destroy(header); - bam_close(fp); - return 0; -} - static int usage() { fprintf(stderr, "\n"); @@ -131,7 +91,6 @@ int main(int argc, char *argv[]) else if (strcmp(argv[1], "rmdup") == 0) return bam_rmdup(argc-1, argv+1); else if (strcmp(argv[1], "glfview") == 0) return glf3_view_main(argc-1, argv+1); else if (strcmp(argv[1], "flagstat") == 0) return bam_flagstat(argc-1, argv+1); - else if (strcmp(argv[1], "tagview") == 0) return bam_tagview(argc-1, argv+1); else if (strcmp(argv[1], "calmd") == 0) return bam_fillmd(argc-1, argv+1); else if (strcmp(argv[1], "fillmd") == 0) return bam_fillmd(argc-1, argv+1); else if (strcmp(argv[1], "reheader") == 0) return main_reheader(argc-1, argv+1); diff --git a/bcftools/Makefile b/bcftools/Makefile index b5a1b1b..fd2e2df 100644 --- a/bcftools/Makefile +++ b/bcftools/Makefile @@ -1,7 +1,7 @@ CC= gcc CFLAGS= -g -Wall -O2 #-m64 #-arch ppc DFLAGS= -D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE -LOBJS= bcf.o vcf.o bcfutils.o prob1.o ld.o kfunc.o index.o fet.o bcf2qcall.o +LOBJS= bcf.o vcf.o bcfutils.o prob1.o em.o kfunc.o kmin.o index.o fet.o bcf2qcall.o OMISC= .. AOBJS= call1.o main.o $(OMISC)/kstring.o $(OMISC)/bgzf.o $(OMISC)/knetfile.o $(OMISC)/bedidx.o PROG= bcftools diff --git a/bcftools/bcf.c b/bcftools/bcf.c index 080b6fe..84a8e76 100644 --- a/bcftools/bcf.c +++ b/bcftools/bcf.c @@ -103,8 +103,14 @@ int bcf_sync(bcf1_t *b) ks_tokaux_t aux; // set ref, alt, flt, info, fmt b->ref = b->alt = b->flt = b->info = b->fmt = 0; - for (p = b->str, n = 0; p < b->str + b->l_str; ++p) - if (*p == 0 && p+1 != b->str + b->l_str) tmp[n++] = p + 1; + for (p = b->str, n = 0; p < b->str + b->l_str; ++p) { + if (*p == 0 && p+1 != b->str + b->l_str) { + if (n == 5) { + ++n; + break; + } else tmp[n++] = p + 1; + } + } if (n != 5) { fprintf(stderr, "[%s] incorrect number of fields (%d != 5) at %d:%d\n", __func__, n, b->tid, b->pos); return -1; diff --git a/bcftools/bcftools.1 b/bcftools/bcftools.1 index 6d11e77..c6b4968 100644 --- a/bcftools/bcftools.1 +++ b/bcftools/bcftools.1 @@ -95,13 +95,18 @@ Uncompressed BCF output (force -b). .B Consensus/Variant Calling Options: .TP 10 .B -c -Call variants. +Call variants using Bayesian inference. This option automatically invokes option +.BR -e . .TP .BI -d \ FLOAT When .B -v is in use, skip loci where the fraction of samples covered by reads is below FLOAT. [0] .TP +.B -e +Perform max-likelihood inference only, including estimating the site allele frequency, +testing Hardy-Weinberg equlibrium and testing associations with LRT. +.TP .B -g Call per-sample genotypes at variant sites (force -c) .TP diff --git a/bcftools/call1.c b/bcftools/call1.c index d61d2c4..8e57aa1 100644 --- a/bcftools/call1.c +++ b/bcftools/call1.c @@ -25,12 +25,13 @@ KSTREAM_INIT(gzFile, gzread, 16384) #define VC_NO_INDEL 8192 #define VC_ANNO_MAX 16384 #define VC_FIX_PL 32768 +#define VC_EM 0x10000 typedef struct { int flag, prior_type, n1, n_sub, *sublist, n_perm; char *prior_file, **subsam, *fn_dict; uint8_t *ploidy; - double theta, pref, indel_frac, min_perm_p, min_smpl_frac; + double theta, pref, indel_frac, min_perm_p, min_smpl_frac, min_lrt; void *bed; } viewconf_t; @@ -38,23 +39,6 @@ void *bed_read(const char *fn); void bed_destroy(void *_h); int bed_overlap(const void *_h, const char *chr, int beg, int end); -static double test_hwe(const double g[3]) -{ - extern double kf_gammaq(double p, double x); - double fexp, chi2, f[3], n; - int i; - n = g[0] + g[1] + g[2]; - fexp = (2. * g[2] + g[1]) / (2. * n); - if (fexp > 1. - 1e-10) fexp = 1. - 1e-10; - if (fexp < 1e-10) fexp = 1e-10; - f[0] = n * (1. - fexp) * (1. - fexp); - f[1] = n * 2. * fexp * (1. - fexp); - f[2] = n * fexp * fexp; - for (i = 0, chi2 = 0.; i < 3; ++i) - chi2 += (g[i] - f[i]) * (g[i] - f[i]) / f[i]; - return kf_gammaq(.5, chi2 / 2.); -} - typedef struct { double p[4]; int mq, depth, is_tested, d[4]; @@ -118,44 +102,53 @@ static void rm_info(bcf1_t *b, const char *key) bcf_sync(b); } -static int update_bcf1(int n_smpl, bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p1rst_t *pr, double pref, int flag) +static int update_bcf1(bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p1rst_t *pr, double pref, int flag, double em[9]) { kstring_t s; - int has_I16, is_var = (pr->p_ref < pref); - double fq, r = is_var? pr->p_ref : pr->p_var; + int has_I16, is_var; + double fq, r; anno16_t a; has_I16 = test16(b, &a) >= 0? 1 : 0; - rm_info(b, "I16="); + rm_info(b, "I16="); // FIXME: probably this function has a bug. If I move it below, I16 will not be removed! memset(&s, 0, sizeof(kstring_t)); kputc('\0', &s); kputs(b->ref, &s); kputc('\0', &s); kputs(b->alt, &s); kputc('\0', &s); kputc('\0', &s); kputs(b->info, &s); if (b->info[0]) kputc(';', &s); -// ksprintf(&s, "AF1=%.4lg;AFE=%.4lg;CI95=%.4lg,%.4lg", 1.-pr->f_em, 1.-pr->f_exp, pr->cil, pr->cih); - ksprintf(&s, "AF1=%.4g;CI95=%.4g,%.4g;G3=%.4g,%.4g,%.4g", 1.-pr->f_em, pr->cil, pr->cih, pr->g[2], pr->g[1], pr->g[0]); - if (n_smpl > 5) { - double hwe = test_hwe(pr->g); - if (hwe < 0.1) ksprintf(&s, ";HWE=%.4g", hwe); + { // print EM + if (em[0] >= 0) ksprintf(&s, "AF1=%.4g", 1 - em[0]); + if (em[4] >= 0 && em[4] <= 0.05) ksprintf(&s, ";G3=%.4g,%.4g,%.4g;HWE=%.3g", em[3], em[2], em[1], em[4]); + if (em[5] >= 0 && em[6] >= 0) ksprintf(&s, ";AF2=%.4g,%.4g", 1 - em[5], 1 - em[6]); + if (em[7] >= 0) ksprintf(&s, ";LRT=%.3g", em[7]); } + if (pr == 0) { // if pr is unset, return + kputc('\0', &s); kputs(b->fmt, &s); kputc('\0', &s); + free(b->str); + b->m_str = s.m; b->l_str = s.l; b->str = s.s; + bcf_sync(b); + return 1; + } + + is_var = (pr->p_ref < pref); + r = is_var? pr->p_ref : pr->p_var; + + ksprintf(&s, ";CI95=%.4g,%.4g", pr->cil, pr->cih); // FIXME: when EM is not used, ";" should be omitted! if (has_I16) ksprintf(&s, ";DP4=%d,%d,%d,%d;MQ=%d", a.d[0], a.d[1], a.d[2], a.d[3], a.mq); fq = pr->p_ref_folded < 0.5? -4.343 * log(pr->p_ref_folded) : 4.343 * log(pr->p_var_folded); if (fq < -999) fq = -999; if (fq > 999) fq = 999; ksprintf(&s, ";FQ=%.3g", fq); if (pr->cmp[0] >= 0.) { // two sample groups - int i, q[3], pq; + int i, q[3]; for (i = 1; i < 3; ++i) { double x = pr->cmp[i] + pr->cmp[0]/2.; q[i] = x == 0? 255 : (int)(-4.343 * log(x) + .499); if (q[i] > 255) q[i] = 255; } - pq = (int)(-4.343 * log(pr->p_chi2) + .499); if (pr->perm_rank >= 0) ksprintf(&s, ";PR=%d", pr->perm_rank); - ksprintf(&s, ";QCHI2=%d;PCHI2=%.3g;PC2=%d,%d", pq, q[1], q[2], pr->p_chi2); - ksprintf(&s, ";AF2=%.4g,%.4g", 1.-pr->f_em2[0], 1.-pr->f_em2[1]); -// ksprintf(&s, ",%g,%g,%g", pr->cmp[0], pr->cmp[1], pr->cmp[2]); + ksprintf(&s, ";PCHI2=%.3g;PC2=%d,%d", q[1], q[2], pr->p_chi2); } if (has_I16 && a.is_tested) ksprintf(&s, ";PV4=%.2g,%.2g,%.2g,%.2g", a.p[0], a.p[1], a.p[2], a.p[3]); kputc('\0', &s); @@ -175,7 +168,7 @@ static int update_bcf1(int n_smpl, bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p b->m_str = s.m; b->l_str = s.l; b->str = s.s; bcf_sync(b); for (i = 0; i < b->n_smpl; ++i) { - x = bcf_p1_call_gt(pa, pr->f_em, i); + x = bcf_p1_call_gt(pa, pr->f_exp, i); ((uint8_t*)b->gi[old_n_gi].data)[i] = (x&3) == 0? 1<<3|1 : (x&3) == 1? 1 : 0; ((uint8_t*)b->gi[old_n_gi+1].data)[i] = x>>2; } @@ -270,7 +263,7 @@ static void write_header(bcf_hdr_t *h) h->l_txt = str.l + 1; h->txt = str.s; } -double bcf_ld_freq(const bcf1_t *b0, const bcf1_t *b1, double f[4]); +double bcf_pair_freq(const bcf1_t *b0, const bcf1_t *b1, double f[4]); int bcfview(int argc, char *argv[]) { @@ -291,8 +284,8 @@ int bcfview(int argc, char *argv[]) tid = begin = end = -1; memset(&vc, 0, sizeof(viewconf_t)); - vc.prior_type = vc.n1 = -1; vc.theta = 1e-3; vc.pref = 0.5; vc.indel_frac = -1.; vc.n_perm = 0; vc.min_perm_p = 0.01; vc.min_smpl_frac = 0; - while ((c = getopt(argc, argv, "FN1:l:cHAGvbSuP:t:p:QgLi:IMs:D:U:X:d:")) >= 0) { + vc.prior_type = vc.n1 = -1; vc.theta = 1e-3; vc.pref = 0.5; vc.indel_frac = -1.; vc.n_perm = 0; vc.min_perm_p = 0.01; vc.min_smpl_frac = 0; vc.min_lrt = 1; + while ((c = getopt(argc, argv, "FN1:l:cC:eHAGvbSuP:t:p:QgLi:IMs:D:U:X:d:")) >= 0) { switch (c) { case '1': vc.n1 = atoi(optarg); break; case 'l': vc.bed = bed_read(optarg); break; @@ -304,6 +297,7 @@ int bcfview(int argc, char *argv[]) case 'b': vc.flag |= VC_BCFOUT; break; case 'S': vc.flag |= VC_VCFIN; break; case 'c': vc.flag |= VC_CALL; break; + case 'e': vc.flag |= VC_EM; break; case 'v': vc.flag |= VC_VARONLY | VC_CALL; break; case 'u': vc.flag |= VC_UNCOMP | VC_BCFOUT; break; case 'g': vc.flag |= VC_CALL_GT | VC_CALL; break; @@ -315,6 +309,7 @@ int bcfview(int argc, char *argv[]) case 'Q': vc.flag |= VC_QCALL; break; case 'L': vc.flag |= VC_ADJLD; break; case 'U': vc.n_perm = atoi(optarg); break; + case 'C': vc.min_lrt = atof(optarg); break; case 'X': vc.min_perm_p = atof(optarg); break; case 'd': vc.min_smpl_frac = atof(optarg); break; case 's': vc.subsam = read_samples(optarg, &vc.n_sub); @@ -347,8 +342,9 @@ int bcfview(int argc, char *argv[]) fprintf(stderr, " -S input is VCF\n"); fprintf(stderr, " -u uncompressed BCF output (force -b)\n"); fprintf(stderr, "\nConsensus/variant calling options:\n\n"); - fprintf(stderr, " -c SNP calling\n"); + fprintf(stderr, " -c SNP calling (force -e)\n"); fprintf(stderr, " -d FLOAT skip loci where less than FLOAT fraction of samples covered [0]\n"); + fprintf(stderr, " -e likelihood based analyses\n"); fprintf(stderr, " -g call genotypes at variant sites (force -c)\n"); fprintf(stderr, " -i FLOAT indel-to-substitution ratio [%.4g]\n", vc.indel_frac); fprintf(stderr, " -I skip indels\n"); @@ -358,12 +354,14 @@ int bcfview(int argc, char *argv[]) fprintf(stderr, " -v output potential variant sites only (force -c)\n"); fprintf(stderr, "\nContrast calling and association test options:\n\n"); fprintf(stderr, " -1 INT number of group-1 samples [0]\n"); + fprintf(stderr, " -C FLOAT posterior constrast for LRTBCF conversion please specify the sequence dictionary with -D\n", __func__); return 1; @@ -402,7 +400,7 @@ int bcfview(int argc, char *argv[]) return 1; } } else bcf_p1_init_prior(p1, vc.prior_type, vc.theta); - if (vc.n1 > 0) { + if (vc.n1 > 0 && vc.min_lrt > 0.) { // set n1 bcf_p1_set_n1(p1, vc.n1); bcf_p1_init_subprior(p1, vc.prior_type, vc.theta); } @@ -427,6 +425,7 @@ int bcfview(int argc, char *argv[]) } while (vcf_read(bp, hin, b) > 0) { int is_indel; + double em[9]; if ((vc.flag & VC_VARONLY) && strcmp(b->alt, "X") == 0) continue; if ((vc.flag & VC_VARONLY) && vc.min_smpl_frac > 0.) { extern int bcf_smpl_covered(const bcf1_t *b); @@ -455,10 +454,15 @@ int bcfview(int argc, char *argv[]) bcf_2qcall(hout, b); continue; } + if (vc.flag & VC_EM) bcf_em1(b, vc.n1, 0xff, em); + else { + int i; + for (i = 0; i < 9; ++i) em[i] = -1.; + } if (vc.flag & (VC_CALL|VC_ADJLD)) bcf_gl2pl(b); if (vc.flag & VC_CALL) { // call variants bcf_p1rst_t pr; - bcf_p1_cal(b, p1, &pr); // pr.g[3] is not calculated here + int calret = bcf_p1_cal(b, (em[7] >= 0 && em[7] < vc.min_lrt), p1, &pr); if (n_processed % 100000 == 0) { fprintf(stderr, "[%s] %ld sites processed.\n", __func__, (long)n_processed); bcf_p1_dump_afs(p1); @@ -468,17 +472,24 @@ int bcfview(int argc, char *argv[]) bcf_p1rst_t r; int i, n = 0; for (i = 0; i < vc.n_perm; ++i) { +#ifdef BCF_PERM_LRT // LRT based permutation is much faster but less robust to artifacts + double x[9]; + bcf_shuffle(b, seeds[i]); + bcf_em1(b, vc.n1, 1<<7, x); + if (x[7] < em[7]) ++n; +#else bcf_shuffle(b, seeds[i]); - bcf_p1_cal(b, p1, &r); + bcf_p1_cal(b, 1, p1, &r); if (pr.p_chi2 >= r.p_chi2) ++n; +#endif } pr.perm_rank = n; } - update_bcf1(hout->n_smpl, b, p1, &pr, vc.pref, vc.flag); - } + if (calret >= 0) update_bcf1(b, p1, &pr, vc.pref, vc.flag, em); + } else if (vc.flag & VC_EM) update_bcf1(b, 0, 0, 0, vc.flag, em); if (vc.flag & VC_ADJLD) { // compute LD double f[4], r2; - if ((r2 = bcf_ld_freq(blast, b, f)) >= 0) { + if ((r2 = bcf_pair_freq(blast, b, f)) >= 0) { kstring_t s; s.m = s.l = 0; s.s = 0; if (*b->info) kputc(';', &s); diff --git a/bcftools/em.c b/bcftools/em.c new file mode 100644 index 0000000..f5d2fd9 --- /dev/null +++ b/bcftools/em.c @@ -0,0 +1,306 @@ +#include +#include +#include +#include "bcf.h" +#include "kmin.h" + +static double g_q2p[256]; + +#define ITER_MAX 50 +#define ITER_TRY 10 +#define EPS 1e-5 + +extern double kf_gammaq(double, double); + +/* + Generic routines + */ +// get the 3 genotype likelihoods +static double *get_pdg3(const bcf1_t *b) +{ + double *pdg; + const uint8_t *PL = 0; + int i, PL_len = 0; + // initialize g_q2p if necessary + if (g_q2p[0] == 0.) + for (i = 0; i < 256; ++i) + g_q2p[i] = pow(10., -i / 10.); + // set PL and PL_len + for (i = 0; i < b->n_gi; ++i) { + if (b->gi[i].fmt == bcf_str2int("PL", 2)) { + PL = (const uint8_t*)b->gi[i].data; + PL_len = b->gi[i].len; + break; + } + } + if (i == b->n_gi) return 0; // no PL + // fill pdg + pdg = malloc(3 * b->n_smpl * sizeof(double)); + for (i = 0; i < b->n_smpl; ++i) { + const uint8_t *pi = PL + i * PL_len; + double *p = pdg + i * 3; + p[0] = g_q2p[pi[2]]; p[1] = g_q2p[pi[1]]; p[2] = g_q2p[pi[0]]; + } + return pdg; +} + +// estimate site allele frequency in a very naive and inaccurate way +static double est_freq(int n, const double *pdg) +{ + int i, gcnt[3], tmp1; + // get a rough estimate of the genotype frequency + gcnt[0] = gcnt[1] = gcnt[2] = 0; + for (i = 0; i < n; ++i) { + const double *p = pdg + i * 3; + if (p[0] != 1. || p[1] != 1. || p[2] != 1.) { + int which = p[0] > p[1]? 0 : 1; + which = p[which] > p[2]? which : 2; + ++gcnt[which]; + } + } + tmp1 = gcnt[0] + gcnt[1] + gcnt[2]; + return (tmp1 == 0)? -1.0 : (.5 * gcnt[1] + gcnt[2]) / tmp1; +} + +/* + Single-locus EM + */ + +typedef struct { + int beg, end; + const double *pdg; +} minaux1_t; + +static double prob1(double f, void *data) +{ + minaux1_t *a = (minaux1_t*)data; + double p = 1., l = 0., f3[3]; + int i; +// printf("brent %lg\n", f); + if (f < 0 || f > 1) return 1e300; + f3[0] = (1.-f)*(1.-f); f3[1] = 2.*f*(1.-f); f3[2] = f*f; + for (i = a->beg; i < a->end; ++i) { + const double *pdg = a->pdg + i * 3; + p *= pdg[0] * f3[0] + pdg[1] * f3[1] + pdg[2] * f3[2]; + if (p < 1e-200) l -= log(p), p = 1.; + } + return l - log(p); +} + +// one EM iteration for allele frequency estimate +static double freq_iter(double *f, const double *_pdg, int beg, int end) +{ + double f0 = *f, f3[3], err; + int i; +// printf("em %lg\n", *f); + f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0; + for (i = beg, f0 = 0.; i < end; ++i) { + const double *pdg = _pdg + i * 3; + f0 += (pdg[1] * f3[1] + 2. * pdg[2] * f3[2]) + / (pdg[0] * f3[0] + pdg[1] * f3[1] + pdg[2] * f3[2]); + } + f0 /= (end - beg) * 2; + err = fabs(f0 - *f); + *f = f0; + return err; +} + +/* The following function combines EM and Brent's method. When the signal from + * the data is strong, EM is faster but sometimes, EM may converge very slowly. + * When this happens, we switch to Brent's method. The idea is learned from + * Rasmus Nielsen. + */ +static double freqml(double f0, int beg, int end, const double *pdg) +{ + int i; + double f; + for (i = 0, f = f0; i < ITER_TRY; ++i) + if (freq_iter(&f, pdg, beg, end) < EPS) break; + if (i == ITER_TRY) { // haven't converged yet; try Brent's method + minaux1_t a; + a.beg = beg; a.end = end; a.pdg = pdg; + kmin_brent(prob1, f0 == f? .5*f0 : f0, f, (void*)&a, EPS, &f); + } + return f; +} + +// one EM iteration for genotype frequency estimate +static double g3_iter(double g[3], const double *_pdg, int beg, int end) +{ + double err, gg[3]; + int i; + gg[0] = gg[1] = gg[2] = 0.; +// printf("%lg,%lg,%lg\n", g[0], g[1], g[2]); + for (i = beg; i < end; ++i) { + double sum, tmp[3]; + const double *pdg = _pdg + i * 3; + tmp[0] = pdg[0] * g[0]; tmp[1] = pdg[1] * g[1]; tmp[2] = pdg[2] * g[2]; + sum = (tmp[0] + tmp[1] + tmp[2]) * (end - beg); + gg[0] += tmp[0] / sum; gg[1] += tmp[1] / sum; gg[2] += tmp[2] / sum; + } + err = fabs(gg[0] - g[0]) > fabs(gg[1] - g[1])? fabs(gg[0] - g[0]) : fabs(gg[1] - g[1]); + err = err > fabs(gg[2] - g[2])? err : fabs(gg[2] - g[2]); + g[0] = gg[0]; g[1] = gg[1]; g[2] = gg[2]; + return err; +} + +// perform likelihood ratio test +static double lk_ratio_test(int n, int n1, const double *pdg, double f3[3][3]) +{ + double r; + int i; + for (i = 0, r = 1.; i < n1; ++i) { + const double *p = pdg + i * 3; + r *= (p[0] * f3[1][0] + p[1] * f3[1][1] + p[2] * f3[1][2]) + / (p[0] * f3[0][0] + p[1] * f3[0][1] + p[2] * f3[0][2]); + } + for (; i < n; ++i) { + const double *p = pdg + i * 3; + r *= (p[0] * f3[2][0] + p[1] * f3[2][1] + p[2] * f3[2][2]) + / (p[0] * f3[0][0] + p[1] * f3[0][1] + p[2] * f3[0][2]); + } + return r; +} + +// x[0]: ref frequency +// x[1..3]: alt-alt, alt-ref, ref-ref frequenc +// x[4]: HWE P-value +// x[5..6]: group1 freq, group2 freq +// x[7]: 1-degree P-value +// x[8]: 2-degree P-value +int bcf_em1(const bcf1_t *b, int n1, int flag, double x[9]) +{ + double *pdg; + int i, n, n2; + if (b->n_alleles < 2) return -1; // one allele only + // initialization + if (n1 < 0 || n1 > b->n_smpl) n1 = 0; + if (flag & 1<<7) flag |= 7<<5; // compute group freq if LRT is required + if (flag & 0xf<<1) flag |= 0xf<<1; + n = b->n_smpl; n2 = n - n1; + pdg = get_pdg3(b); + if (pdg == 0) return -1; + for (i = 0; i < 9; ++i) x[i] = -1.; + { + if ((x[0] = est_freq(n, pdg)) < 0.) { + free(pdg); + return -1; // no data + } + x[0] = freqml(x[0], 0, n, pdg); + } + if (flag & (0xf<<1|1<<8)) { // estimate the genotype frequency and test HWE + double *g = x + 1, f3[3], r; + f3[0] = g[0] = (1 - x[0]) * (1 - x[0]); + f3[1] = g[1] = 2 * x[0] * (1 - x[0]); + f3[2] = g[2] = x[0] * x[0]; + for (i = 0; i < ITER_MAX; ++i) + if (g3_iter(g, pdg, 0, n) < EPS) break; + // Hardy-Weinberg equilibrium (HWE) + for (i = 0, r = 1.; i < n; ++i) { + double *p = pdg + i * 3; + r *= (p[0] * g[0] + p[1] * g[1] + p[2] * g[2]) / (p[0] * f3[0] + p[1] * f3[1] + p[2] * f3[2]); + } + x[4] = kf_gammaq(.5, log(r)); + } + if ((flag & 7<<5) && n1 > 0 && n1 < n) { // group frequency + x[5] = freqml(x[0], 0, n1, pdg); + x[6] = freqml(x[0], n1, n, pdg); + } + if ((flag & 1<<7) && n1 > 0 && n1 < n) { // 1-degree P-value + double f[3], f3[3][3]; + f[0] = x[0]; f[1] = x[5]; f[2] = x[6]; + for (i = 0; i < 3; ++i) + f3[i][0] = (1-f[i])*(1-f[i]), f3[i][1] = 2*f[i]*(1-f[i]), f3[i][2] = f[i]*f[i]; + x[7] = kf_gammaq(.5, log(lk_ratio_test(n, n1, pdg, f3))); + } + if ((flag & 1<<8) && n1 > 0 && n1 < n) { // 2-degree P-value + double g[3][3]; + for (i = 0; i < 3; ++i) memcpy(g[i], x + 1, 3 * sizeof(double)); + for (i = 0; i < ITER_MAX; ++i) + if (g3_iter(g[1], pdg, 0, n1) < EPS) break; + for (i = 0; i < ITER_MAX; ++i) + if (g3_iter(g[2], pdg, n1, n) < EPS) break; + x[8] = kf_gammaq(1., log(lk_ratio_test(n, n1, pdg, g))); + } + // free + free(pdg); + return 0; +} + +/* + Two-locus EM (LD) + */ + +#define _G1(h, k) ((h>>1&1) + (k>>1&1)) +#define _G2(h, k) ((h&1) + (k&1)) + +// 0: the previous site; 1: the current site +static int pair_freq_iter(int n, double *pdg[2], double f[4]) +{ + double ff[4]; + int i, k, h; +// printf("%lf,%lf,%lf,%lf\n", f[0], f[1], f[2], f[3]); + memset(ff, 0, 4 * sizeof(double)); + for (i = 0; i < n; ++i) { + double *p[2], sum, tmp; + p[0] = pdg[0] + i * 3; p[1] = pdg[1] + i * 3; + for (k = 0, sum = 0.; k < 4; ++k) + for (h = 0; h < 4; ++h) + sum += f[k] * f[h] * p[0][_G1(k,h)] * p[1][_G2(k,h)]; + for (k = 0; k < 4; ++k) { + tmp = f[0] * (p[0][_G1(0,k)] * p[1][_G2(0,k)] + p[0][_G1(k,0)] * p[1][_G2(k,0)]) + + f[1] * (p[0][_G1(1,k)] * p[1][_G2(1,k)] + p[0][_G1(k,1)] * p[1][_G2(k,1)]) + + f[2] * (p[0][_G1(2,k)] * p[1][_G2(2,k)] + p[0][_G1(k,2)] * p[1][_G2(k,2)]) + + f[3] * (p[0][_G1(3,k)] * p[1][_G2(3,k)] + p[0][_G1(k,3)] * p[1][_G2(k,3)]); + ff[k] += f[k] * tmp / sum; + } + } + for (k = 0; k < 4; ++k) f[k] = ff[k] / (2 * n); + return 0; +} + +double bcf_pair_freq(const bcf1_t *b0, const bcf1_t *b1, double f[4]) +{ + const bcf1_t *b[2]; + int i, j, n_smpl; + double *pdg[2], flast[4], r, f0[2]; + // initialize others + if (b0->n_smpl != b1->n_smpl) return -1; // different number of samples + n_smpl = b0->n_smpl; + b[0] = b0; b[1] = b1; + f[0] = f[1] = f[2] = f[3] = -1.; + if (b[0]->n_alleles < 2 || b[1]->n_alleles < 2) return -1; // one allele only + pdg[0] = get_pdg3(b0); pdg[1] = get_pdg3(b1); + if (pdg[0] == 0 || pdg[1] == 0) { + free(pdg[0]); free(pdg[1]); + return -1; + } + // set the initial value + f0[0] = est_freq(n_smpl, pdg[0]); + f0[1] = est_freq(n_smpl, pdg[1]); + f[0] = (1 - f0[0]) * (1 - f0[1]); f[3] = f0[0] * f0[1]; + f[1] = (1 - f0[0]) * f0[1]; f[2] = f0[0] * (1 - f0[1]); + // iteration + for (j = 0; j < ITER_MAX; ++j) { + double eps = 0; + memcpy(flast, f, 4 * sizeof(double)); + pair_freq_iter(n_smpl, pdg, f); + for (i = 0; i < 4; ++i) { + double x = fabs(f[i] - flast[i]); + if (x > eps) eps = x; + } + if (eps < EPS) break; + } + // free + free(pdg[0]); free(pdg[1]); + { // calculate r^2 + double p[2], q[2], D; + p[0] = f[0] + f[1]; q[0] = 1 - p[0]; + p[1] = f[0] + f[2]; q[1] = 1 - p[1]; + D = f[0] * f[3] - f[1] * f[2]; + r = sqrt(D * D / (p[0] * p[1] * q[0] * q[1])); +// printf("R(%lf,%lf,%lf,%lf)=%lf\n", f[0], f[1], f[2], f[3], r); + if (isnan(r)) r = -1.; + } + return r; +} diff --git a/bcftools/kmin.c b/bcftools/kmin.c new file mode 100644 index 0000000..5b8193b --- /dev/null +++ b/bcftools/kmin.c @@ -0,0 +1,209 @@ +/* The MIT License + + Copyright (c) 2008, 2010 by Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Hooke-Jeeves algorithm for nonlinear minimization + + Based on the pseudocodes by Bell and Pike (CACM 9(9):684-685), and + the revision by Tomlin and Smith (CACM 12(11):637-638). Both of the + papers are comments on Kaupe's Algorithm 178 "Direct Search" (ACM + 6(6):313-314). The original algorithm was designed by Hooke and + Jeeves (ACM 8:212-229). This program is further revised according to + Johnson's implementation at Netlib (opt/hooke.c). + + Hooke-Jeeves algorithm is very simple and it works quite well on a + few examples. However, it might fail to converge due to its heuristic + nature. A possible improvement, as is suggested by Johnson, may be to + choose a small r at the beginning to quickly approach to the minimum + and a large r at later step to hit the minimum. + */ + +#include +#include +#include +#include "kmin.h" + +static double __kmin_hj_aux(kmin_f func, int n, double *x1, void *data, double fx1, double *dx, int *n_calls) +{ + int k, j = *n_calls; + double ftmp; + for (k = 0; k != n; ++k) { + x1[k] += dx[k]; + ftmp = func(n, x1, data); ++j; + if (ftmp < fx1) fx1 = ftmp; + else { /* search the opposite direction */ + dx[k] = 0.0 - dx[k]; + x1[k] += dx[k] + dx[k]; + ftmp = func(n, x1, data); ++j; + if (ftmp < fx1) fx1 = ftmp; + else x1[k] -= dx[k]; /* back to the original x[k] */ + } + } + *n_calls = j; + return fx1; /* here: fx1=f(n,x1) */ +} + +double kmin_hj(kmin_f func, int n, double *x, void *data, double r, double eps, int max_calls) +{ + double fx, fx1, *x1, *dx, radius; + int k, n_calls = 0; + x1 = (double*)calloc(n, sizeof(double)); + dx = (double*)calloc(n, sizeof(double)); + for (k = 0; k != n; ++k) { /* initial directions, based on MGJ */ + dx[k] = fabs(x[k]) * r; + if (dx[k] == 0) dx[k] = r; + } + radius = r; + fx1 = fx = func(n, x, data); ++n_calls; + for (;;) { + memcpy(x1, x, n * sizeof(double)); /* x1 = x */ + fx1 = __kmin_hj_aux(func, n, x1, data, fx, dx, &n_calls); + while (fx1 < fx) { + for (k = 0; k != n; ++k) { + double t = x[k]; + dx[k] = x1[k] > x[k]? fabs(dx[k]) : 0.0 - fabs(dx[k]); + x[k] = x1[k]; + x1[k] = x1[k] + x1[k] - t; + } + fx = fx1; + if (n_calls >= max_calls) break; + fx1 = func(n, x1, data); ++n_calls; + fx1 = __kmin_hj_aux(func, n, x1, data, fx1, dx, &n_calls); + if (fx1 >= fx) break; + for (k = 0; k != n; ++k) + if (fabs(x1[k] - x[k]) > .5 * fabs(dx[k])) break; + if (k == n) break; + } + if (radius >= eps) { + if (n_calls >= max_calls) break; + radius *= r; + for (k = 0; k != n; ++k) dx[k] *= r; + } else break; /* converge */ + } + free(x1); free(dx); + return fx1; +} + +// I copied this function somewhere several years ago with some of my modifications, but I forgot the source. +double kmin_brent(kmin1_f func, double a, double b, void *data, double tol, double *xmin) +{ + double bound, u, r, q, fu, tmp, fa, fb, fc, c; + const double gold1 = 1.6180339887; + const double gold2 = 0.3819660113; + const double tiny = 1e-20; + const int max_iter = 100; + + double e, d, w, v, mid, tol1, tol2, p, eold, fv, fw; + int iter; + + fa = func(a, data); fb = func(b, data); + if (fb > fa) { // swap, such that f(a) > f(b) + tmp = a; a = b; b = tmp; + tmp = fa; fa = fb; fb = tmp; + } + c = b + gold1 * (b - a), fc = func(c, data); // golden section extrapolation + while (fb > fc) { + bound = b + 100.0 * (c - b); // the farthest point where we want to go + r = (b - a) * (fb - fc); + q = (b - c) * (fb - fa); + if (fabs(q - r) < tiny) { // avoid 0 denominator + tmp = q > r? tiny : 0.0 - tiny; + } else tmp = q - r; + u = b - ((b - c) * q - (b - a) * r) / (2.0 * tmp); // u is the parabolic extrapolation point + if ((b > u && u > c) || (b < u && u < c)) { // u lies between b and c + fu = func(u, data); + if (fu < fc) { // (b,u,c) bracket the minimum + a = b; b = u; fa = fb; fb = fu; + break; + } else if (fu > fb) { // (a,b,u) bracket the minimum + c = u; fc = fu; + break; + } + u = c + gold1 * (c - b); fu = func(u, data); // golden section extrapolation + } else if ((c > u && u > bound) || (c < u && u < bound)) { // u lies between c and bound + fu = func(u, data); + if (fu < fc) { // fb > fc > fu + b = c; c = u; u = c + gold1 * (c - b); + fb = fc; fc = fu; fu = func(u, data); + } else { // (b,c,u) bracket the minimum + a = b; b = c; c = u; + fa = fb; fb = fc; fc = fu; + break; + } + } else if ((u > bound && bound > c) || (u < bound && bound < c)) { // u goes beyond the bound + u = bound; fu = func(u, data); + } else { // u goes the other way around, use golden section extrapolation + u = c + gold1 * (c - b); fu = func(u, data); + } + a = b; b = c; c = u; + fa = fb; fb = fc; fc = fu; + } + if (a > c) u = a, a = c, c = u; // swap + + // now, afb and fb tol1) { + // related to parabolic interpolation + r = (b - w) * (fb - fv); + q = (b - v) * (fb - fw); + p = (b - v) * q - (b - w) * r; + q = 2.0 * (q - r); + if (q > 0.0) p = 0.0 - p; + else q = 0.0 - q; + eold = e; e = d; + if (fabs(p) >= fabs(0.5 * q * eold) || p <= q * (a - b) || p >= q * (c - b)) { + d = gold2 * (e = (b >= mid ? a - b : c - b)); + } else { + d = p / q; u = b + d; // actual parabolic interpolation happens here + if (u - a < tol2 || c - u < tol2) + d = (mid > b)? tol1 : 0.0 - tol1; + } + } else d = gold2 * (e = (b >= mid ? a - b : c - b)); // golden section interpolation + u = fabs(d) >= tol1 ? b + d : b + (d > 0.0? tol1 : -tol1); + fu = func(u, data); + if (fu <= fb) { // u is the minimum point so far + if (u >= b) a = b; + else c = b; + v = w; w = b; b = u; fv = fw; fw = fb; fb = fu; + } else { // adjust (a,c) and (u,v,w) + if (u < b) a = u; + else c = u; + if (fu <= fw || w == b) { + v = w; w = u; + fv = fw; fw = fu; + } else if (fu <= fv || v == b || v == w) { + v = u; fv = fu; + } + } + } + *xmin = b; + return fb; +} diff --git a/bcftools/kmin.h b/bcftools/kmin.h new file mode 100644 index 0000000..6feba45 --- /dev/null +++ b/bcftools/kmin.h @@ -0,0 +1,46 @@ +/* + Copyright (c) 2008, 2010 by Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +#ifndef KMIN_H +#define KMIN_H + +#define KMIN_RADIUS 0.5 +#define KMIN_EPS 1e-7 +#define KMIN_MAXCALL 50000 + +typedef double (*kmin_f)(int, double*, void*); +typedef double (*kmin1_f)(double, void*); + +#ifdef __cplusplus +extern "C" { +#endif + + double kmin_hj(kmin_f func, int n, double *x, void *data, double r, double eps, int max_calls); + double kmin_brent(kmin1_f func, double a, double b, void *data, double tol, double *xmin); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/bcftools/ld.c b/bcftools/ld.c deleted file mode 100644 index 0207819..0000000 --- a/bcftools/ld.c +++ /dev/null @@ -1,143 +0,0 @@ -#include -#include -#include -#include "bcf.h" - -static double g_q2p[256]; - -#define LD_ITER_MAX 50 -#define LD_ITER_EPS 1e-4 - -#define _G1(h, k) ((h>>1&1) + (k>>1&1)) -#define _G2(h, k) ((h&1) + (k&1)) - -// 0: the previous site; 1: the current site -static int freq_iter(int n, double *pdg[2], double f[4]) -{ - double ff[4]; - int i, k, h; - memset(ff, 0, 4 * sizeof(double)); - for (i = 0; i < n; ++i) { - double *p[2], sum, tmp; - p[0] = pdg[0] + i * 3; p[1] = pdg[1] + i * 3; - for (k = 0, sum = 0.; k < 4; ++k) - for (h = 0; h < 4; ++h) - sum += f[k] * f[h] * p[0][_G1(k,h)] * p[1][_G2(k,h)]; - for (k = 0; k < 4; ++k) { - tmp = f[0] * (p[0][_G1(0,k)] * p[1][_G2(0,k)] + p[0][_G1(k,0)] * p[1][_G2(k,0)]) - + f[1] * (p[0][_G1(1,k)] * p[1][_G2(1,k)] + p[0][_G1(k,1)] * p[1][_G2(k,1)]) - + f[2] * (p[0][_G1(2,k)] * p[1][_G2(2,k)] + p[0][_G1(k,2)] * p[1][_G2(k,2)]) - + f[3] * (p[0][_G1(3,k)] * p[1][_G2(3,k)] + p[0][_G1(k,3)] * p[1][_G2(k,3)]); - ff[k] += f[k] * tmp / sum; - } - } - for (k = 0; k < 4; ++k) f[k] = ff[k] / (2 * n); - return 0; -} - -double bcf_ld_freq(const bcf1_t *b0, const bcf1_t *b1, double f[4]) -{ - const bcf1_t *b[2]; - uint8_t *PL[2]; - int i, j, PL_len[2], n_smpl; - double *pdg[2], flast[4], r; - // initialize g_q2p if necessary - if (g_q2p[0] == 0.) - for (i = 0; i < 256; ++i) - g_q2p[i] = pow(10., -i / 10.); - // initialize others - if (b0->n_smpl != b1->n_smpl) return -1; // different number of samples - n_smpl = b0->n_smpl; - b[0] = b0; b[1] = b1; - f[0] = f[1] = f[2] = f[3] = -1.; - if (b[0]->n_alleles < 2 || b[1]->n_alleles < 2) return -1; // one allele only - // set PL and PL_len - for (j = 0; j < 2; ++j) { - const bcf1_t *bj = b[j]; - for (i = 0; i < bj->n_gi; ++i) { - if (bj->gi[i].fmt == bcf_str2int("PL", 2)) { - PL[j] = (uint8_t*)bj->gi[i].data; - PL_len[j] = bj->gi[i].len; - break; - } - } - if (i == bj->n_gi) return -1; // no PL - } - // fill pdg[2] - pdg[0] = malloc(3 * n_smpl * sizeof(double)); - pdg[1] = malloc(3 * n_smpl * sizeof(double)); - for (j = 0; j < 2; ++j) { - for (i = 0; i < n_smpl; ++i) { - const uint8_t *pi = PL[j] + i * PL_len[j]; - double *p = pdg[j] + i * 3; - p[0] = g_q2p[pi[2]]; p[1] = g_q2p[pi[1]]; p[2] = g_q2p[pi[0]]; - } - } - // iteration - f[0] = f[1] = f[2] = f[3] = 0.25; // this is a really bad guess... - for (j = 0; j < LD_ITER_MAX; ++j) { - double eps = 0; - memcpy(flast, f, 4 * sizeof(double)); - freq_iter(n_smpl, pdg, f); - for (i = 0; i < 4; ++i) { - double x = fabs(f[i] - flast[i]); - if (x > eps) eps = x; - } - if (eps < LD_ITER_EPS) break; - } - // free - free(pdg[0]); free(pdg[1]); - { // calculate r^2 - double p[2], q[2], D; - p[0] = f[0] + f[1]; q[0] = 1 - p[0]; - p[1] = f[0] + f[2]; q[1] = 1 - p[1]; - D = f[0] * f[3] - f[1] * f[2]; - r = sqrt(D * D / (p[0] * p[1] * q[0] * q[1])); - // fprintf(stderr, "R(%lf,%lf,%lf,%lf)=%lf\n", f[0], f[1], f[2], f[3], r2); - if (isnan(r)) r = -1.; - } - return r; -} - -int bcf_main_pwld(int argc, char *argv[]) -{ - bcf_t *fp; - bcf_hdr_t *h; - bcf1_t **b, *b0; - int i, j, m, n; - double f[4]; - if (argc == 1) { - fprintf(stderr, "Usage: bcftools pwld \n"); - return 1; - } - fp = bcf_open(argv[1], "rb"); - h = bcf_hdr_read(fp); - // read the entire BCF - m = n = 0; b = 0; - b0 = calloc(1, sizeof(bcf1_t)); - while (bcf_read(fp, h, b0) >= 0) { - if (m == n) { - m = m? m<<1 : 16; - b = realloc(b, sizeof(void*) * m); - } - b[n] = calloc(1, sizeof(bcf1_t)); - bcf_cpy(b[n++], b0); - } - bcf_destroy(b0); - // compute pair-wise r^2 - printf("%d\n", n); // the number of loci - for (i = 0; i < n; ++i) { - printf("%s:%d", h->ns[b[i]->tid], b[i]->pos + 1); - for (j = 0; j < i; ++j) { - double r = bcf_ld_freq(b[i], b[j], f); - printf("\t%.3f", r*r); - } - printf("\t1.000\n"); - } - // free - for (i = 0; i < n; ++i) bcf_destroy(b[i]); - free(b); - bcf_hdr_destroy(h); - bcf_close(fp); - return 0; -} diff --git a/bcftools/main.c b/bcftools/main.c index 75125f8..7293374 100644 --- a/bcftools/main.c +++ b/bcftools/main.c @@ -1,11 +1,14 @@ #include #include #include +#include #include "bcf.h" +#include "kseq.h" +KSTREAM_INIT(gzFile, gzread, 0x10000) + int bcfview(int argc, char *argv[]); int bcf_main_index(int argc, char *argv[]); -int bcf_main_pwld(int argc, char *argv[]); #define BUF_SIZE 0x10000 @@ -43,6 +46,122 @@ int bcf_cat(int n, char * const *fn) return 0; } +extern double bcf_pair_freq(const bcf1_t *b0, const bcf1_t *b1, double f[4]); + +int bcf_main_ldpair(int argc, char *argv[]) +{ + bcf_t *fp; + bcf_hdr_t *h; + bcf1_t *b0, *b1; + bcf_idx_t *idx; + kstring_t str; + void *str2id; + gzFile fplist; + kstream_t *ks; + int dret, lineno = 0; + if (argc < 3) { + fprintf(stderr, "Usage: bcftools ldpair \n"); + return 1; + } + fplist = gzopen(argv[2], "rb"); + ks = ks_init(fplist); + memset(&str, 0, sizeof(kstring_t)); + fp = bcf_open(argv[1], "rb"); + h = bcf_hdr_read(fp); + str2id = bcf_build_refhash(h); + idx = bcf_idx_load(argv[1]); + if (idx == 0) { + fprintf(stderr, "[%s] No bcf index is found. Abort!\n", __func__); + return 1; + } + b0 = calloc(1, sizeof(bcf1_t)); + b1 = calloc(1, sizeof(bcf1_t)); + while (ks_getuntil(ks, '\n', &str, &dret) >= 0) { + char *p, *q; + int k; + int tid0 = -1, tid1 = -1, pos0 = -1, pos1 = -1; + ++lineno; + for (p = q = str.s, k = 0; *p; ++p) { + if (*p == ' ' || *p == '\t') { + *p = '\0'; + if (k == 0) tid0 = bcf_str2id(str2id, q); + else if (k == 1) pos0 = atoi(q) - 1; + else if (k == 2) tid1 = strcmp(q, "=")? bcf_str2id(str2id, q) : tid0; + else if (k == 3) pos1 = atoi(q) - 1; + q = p + 1; + ++k; + } + } + if (k == 3) pos1 = atoi(q) - 1; + if (tid0 >= 0 && tid1 >= 0 && pos0 >= 0 && pos1 >= 0) { + uint64_t off; + double r, f[4]; + off = bcf_idx_query(idx, tid0, pos0); + bgzf_seek(fp->fp, off, SEEK_SET); + while (bcf_read(fp, h, b0) >= 0 && b0->pos != pos0); + off = bcf_idx_query(idx, tid1, pos1); + bgzf_seek(fp->fp, off, SEEK_SET); + while (bcf_read(fp, h, b1) >= 0 && b1->pos != pos1); + r = bcf_pair_freq(b0, b1, f); + r *= r; + printf("%s\t%d\t%s\t%d\t%.4g\t%.4g\t%.4g\t%.4g\t%.4g\n", h->ns[tid0], pos0+1, h->ns[tid1], pos1+1, + r, f[0], f[1], f[2], f[3]); + } //else fprintf(stderr, "[%s] Parse error at line %d.\n", __func__, lineno); + } + bcf_destroy(b0); bcf_destroy(b1); + bcf_idx_destroy(idx); + bcf_str2id_destroy(str2id); + bcf_hdr_destroy(h); + bcf_close(fp); + free(str.s); + ks_destroy(ks); + gzclose(fplist); + return 0; +} + +int bcf_main_ld(int argc, char *argv[]) +{ + bcf_t *fp; + bcf_hdr_t *h; + bcf1_t **b, *b0; + int i, j, m, n; + double f[4]; + if (argc == 1) { + fprintf(stderr, "Usage: bcftools ld \n"); + return 1; + } + fp = bcf_open(argv[1], "rb"); + h = bcf_hdr_read(fp); + // read the entire BCF + m = n = 0; b = 0; + b0 = calloc(1, sizeof(bcf1_t)); + while (bcf_read(fp, h, b0) >= 0) { + if (m == n) { + m = m? m<<1 : 16; + b = realloc(b, sizeof(void*) * m); + } + b[n] = calloc(1, sizeof(bcf1_t)); + bcf_cpy(b[n++], b0); + } + bcf_destroy(b0); + // compute pair-wise r^2 + printf("%d\n", n); // the number of loci + for (i = 0; i < n; ++i) { + printf("%s:%d", h->ns[b[i]->tid], b[i]->pos + 1); + for (j = 0; j < i; ++j) { + double r = bcf_pair_freq(b[i], b[j], f); + printf("\t%.3f", r*r); + } + printf("\t1.000\n"); + } + // free + for (i = 0; i < n; ++i) bcf_destroy(b[i]); + free(b); + bcf_hdr_destroy(h); + bcf_close(fp); + return 0; +} + int main(int argc, char *argv[]) { if (argc == 1) { @@ -52,12 +171,14 @@ int main(int argc, char *argv[]) fprintf(stderr, " index index BCF\n"); fprintf(stderr, " cat concatenate BCFs\n"); fprintf(stderr, " ld compute all-pair r^2\n"); + fprintf(stderr, " ldpair compute r^2 between requested pairs\n"); fprintf(stderr, "\n"); return 1; } if (strcmp(argv[1], "view") == 0) return bcfview(argc-1, argv+1); else if (strcmp(argv[1], "index") == 0) return bcf_main_index(argc-1, argv+1); - else if (strcmp(argv[1], "ld") == 0) return bcf_main_pwld(argc-1, argv+1); + else if (strcmp(argv[1], "ld") == 0) return bcf_main_ld(argc-1, argv+1); + else if (strcmp(argv[1], "ldpair") == 0) return bcf_main_ldpair(argc-1, argv+1); else if (strcmp(argv[1], "cat") == 0) return bcf_cat(argc-2, argv+2); // cat is different ... else { fprintf(stderr, "[main] Unrecognized command.\n"); diff --git a/bcftools/prob1.c b/bcftools/prob1.c index a024d04..fc9cb29 100644 --- a/bcftools/prob1.c +++ b/bcftools/prob1.c @@ -40,6 +40,7 @@ struct __bcf_p1aux_t { double *z, *zswap; // aux for afs double *z1, *z2, *phi1, *phi2; // only calculated when n1 is set double **hg; // hypergeometric distribution + double *lf; // log factorial double t, t1, t2; double *afs, *afs1; // afs: accumulative AFS; afs1: site posterior distribution const uint8_t *PL; // point to PL @@ -154,8 +155,10 @@ bcf_p1aux_t *bcf_p1_init(int n, uint8_t *ploidy) ma->z2 = calloc(ma->M + 1, sizeof(double)); ma->afs = calloc(ma->M + 1, sizeof(double)); ma->afs1 = calloc(ma->M + 1, sizeof(double)); + ma->lf = calloc(ma->M + 1, sizeof(double)); for (i = 0; i < 256; ++i) ma->q2p[i] = pow(10., -i / 10.); + for (i = 0; i <= ma->M; ++i) ma->lf[i] = lgamma(i + 1); bcf_p1_init_prior(ma, MC_PTYPE_FULL, 1e-3); // the simplest prior return ma; } @@ -175,6 +178,7 @@ void bcf_p1_destroy(bcf_p1aux_t *ma) { if (ma) { int k; + free(ma->lf); if (ma->hg && ma->n1 > 0) { for (k = 0; k <= 2*ma->n1; ++k) free(ma->hg[k]); free(ma->hg); @@ -208,39 +212,6 @@ static int cal_pdg(const bcf1_t *b, bcf_p1aux_t *ma) if ((p[i]&0xf) == 0) break; return i; } -// f0 is the reference allele frequency -static double mc_freq_iter(double f0, const bcf_p1aux_t *ma, int beg, int end) -{ - double f, f3[3]; - int i; - f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0; - for (i = beg, f = 0.; i < end; ++i) { - double *pdg; - pdg = ma->pdg + i * 3; - f += (pdg[1] * f3[1] + 2. * pdg[2] * f3[2]) - / (pdg[0] * f3[0] + pdg[1] * f3[1] + pdg[2] * f3[2]); - } - f /= (end - beg) * 2.; - return f; -} - -static double mc_gtfreq_iter(double g[3], const bcf_p1aux_t *ma, int beg, int end) -{ - double err, gg[3]; - int i; - gg[0] = gg[1] = gg[2] = 0.; - for (i = beg; i < end; ++i) { - double *pdg, sum, tmp[3]; - pdg = ma->pdg + i * 3; - tmp[0] = pdg[0] * g[0]; tmp[1] = pdg[1] * g[1]; tmp[2] = pdg[2] * g[2]; - sum = (tmp[0] + tmp[1] + tmp[2]) * (end - beg); - gg[0] += tmp[0] / sum; gg[1] += tmp[1] / sum; gg[2] += tmp[2] / sum; - } - err = fabs(gg[0] - g[0]) > fabs(gg[1] - g[1])? fabs(gg[0] - g[0]) : fabs(gg[1] - g[1]); - err = err > fabs(gg[2] - g[2])? err : fabs(gg[2] - g[2]); - g[0] = gg[0]; g[1] = gg[1]; g[2] = gg[2]; - return err; -} int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k) { @@ -385,9 +356,10 @@ static inline double chi2_test(int a, int b, int c, int d) } // chi2=(a+b+c+d)(ad-bc)^2/[(a+b)(c+d)(a+c)(b+d)] -static inline double contrast2_aux(const bcf_p1aux_t *p1, double sum, int n1, int n2, int k1, int k2, double x[3]) +static inline double contrast2_aux(const bcf_p1aux_t *p1, double sum, int k1, int k2, double x[3]) { double p = p1->phi[k1+k2] * p1->z1[k1] * p1->z2[k2] / sum * p1->hg[k1][k2]; + int n1 = p1->n1, n2 = p1->n - p1->n1; if (p < CONTRAST_TINY) return -1; if (.5*k1/n1 < .5*k2/n2) x[1] += p; else if (.5*k1/n1 > .5*k2/n2) x[2] += p; @@ -415,12 +387,12 @@ static double contrast2(bcf_p1aux_t *p1, double ret[3]) p1->hg[k1][k2] = exp(lgamma(k1+k2+1) + lgamma(p1->M-k1-k2+1) - (lgamma(k1+1) + lgamma(k2+1) + lgamma(2*n1-k1+1) + lgamma(2*n2-k2+1) + tmp)); } } - { // compute sum1 and sum2 + { // compute long double suml = 0; for (k = 0; k <= p1->M; ++k) suml += p1->phi[k] * p1->z[k]; sum = suml; } - { // get the mean k1 and k2 + { // get the max k1 and k2 double max; int max_k; for (k = 0, max = 0, max_k = -1; k <= 2*n1; ++k) { @@ -436,15 +408,15 @@ static double contrast2(bcf_p1aux_t *p1, double ret[3]) } { // We can do the following with one nested loop, but that is an O(N^2) thing. The following code block is much faster for large N. double x[3], y; - long double z = 0.; - x[0] = x[1] = x[2] = 0; + long double z = 0., L[2]; + x[0] = x[1] = x[2] = 0; L[0] = L[1] = 0; for (k1 = k10; k1 >= 0; --k1) { for (k2 = k20; k2 >= 0; --k2) { - if ((y = contrast2_aux(p1, sum, n1, n2, k1, k2, x)) < 0) break; + if ((y = contrast2_aux(p1, sum, k1, k2, x)) < 0) break; else z += y; } for (k2 = k20 + 1; k2 <= 2*n2; ++k2) { - if ((y = contrast2_aux(p1, sum, n1, n2, k1, k2, x)) < 0) break; + if ((y = contrast2_aux(p1, sum, k1, k2, x)) < 0) break; else z += y; } } @@ -452,21 +424,21 @@ static double contrast2(bcf_p1aux_t *p1, double ret[3]) x[0] = x[1] = x[2] = 0; for (k1 = k10 + 1; k1 <= 2*n1; ++k1) { for (k2 = k20; k2 >= 0; --k2) { - if ((y = contrast2_aux(p1, sum, n1, n2, k1, k2, x)) < 0) break; + if ((y = contrast2_aux(p1, sum, k1, k2, x)) < 0) break; else z += y; } for (k2 = k20 + 1; k2 <= 2*n2; ++k2) { - if ((y = contrast2_aux(p1, sum, n1, n2, k1, k2, x)) < 0) break; + if ((y = contrast2_aux(p1, sum, k1, k2, x)) < 0) break; else z += y; } } ret[0] += x[0]; ret[1] += x[1]; ret[2] += x[2]; - if (ret[0] + ret[1] + ret[2] < 0.99) { // in case of bad things happened - ret[0] = ret[1] = ret[2] = 0; + if (ret[0] + ret[1] + ret[2] < 0.95) { // in case of bad things happened + ret[0] = ret[1] = ret[2] = 0; L[0] = L[1] = 0; for (k1 = 0, z = 0.; k1 <= 2*n1; ++k1) for (k2 = 0; k2 <= 2*n2; ++k2) - if ((y = contrast2_aux(p1, sum, n1, n2, k1, k2, ret)) >= 0) z += y; - if (ret[0] + ret[1] + ret[2] < 0.99) // It seems that this may be caused by floating point errors. I do not really understand why... + if ((y = contrast2_aux(p1, sum, k1, k2, ret)) >= 0) z += y; + if (ret[0] + ret[1] + ret[2] < 0.95) // It seems that this may be caused by floating point errors. I do not really understand why... z = 1.0, ret[0] = ret[1] = ret[2] = 1./3; } return (double)z; @@ -502,7 +474,7 @@ static double mc_cal_afs(bcf_p1aux_t *ma, double *p_ref_folded, double *p_var_fo return sum / ma->M; } -int bcf_p1_cal(const bcf1_t *b, bcf_p1aux_t *ma, bcf_p1rst_t *rst) +int bcf_p1_cal(const bcf1_t *b, int do_contrast, bcf_p1aux_t *ma, bcf_p1rst_t *rst) { int i, k; long double sum = 0.; @@ -516,6 +488,7 @@ int bcf_p1_cal(const bcf1_t *b, bcf_p1aux_t *ma, bcf_p1rst_t *rst) break; } } + if (i == b->n_gi) return -1; // no PL if (b->n_alleles < 2) return -1; // FIXME: find a better solution // rst->rank0 = cal_pdg(b, ma); @@ -533,31 +506,6 @@ int bcf_p1_cal(const bcf1_t *b, bcf_p1aux_t *ma, bcf_p1rst_t *rst) rst->f_flat += k * p; } rst->f_flat /= ma->M; - { // calculate f_em - double flast = rst->f_flat; - for (i = 0; i < MC_MAX_EM_ITER; ++i) { - rst->f_em = mc_freq_iter(flast, ma, 0, ma->n); - if (fabs(rst->f_em - flast) < MC_EM_EPS) break; - flast = rst->f_em; - } - if (ma->n1 > 0 && ma->n1 < ma->n) { - for (k = 0; k < 2; ++k) { - flast = rst->f_em; - for (i = 0; i < MC_MAX_EM_ITER; ++i) { - rst->f_em2[k] = k? mc_freq_iter(flast, ma, ma->n1, ma->n) : mc_freq_iter(flast, ma, 0, ma->n1); - if (fabs(rst->f_em2[k] - flast) < MC_EM_EPS) break; - flast = rst->f_em2[k]; - } - } - } - } - { // compute g[3] - rst->g[0] = (1. - rst->f_em) * (1. - rst->f_em); - rst->g[1] = 2. * rst->f_em * (1. - rst->f_em); - rst->g[2] = rst->f_em * rst->f_em; - for (i = 0; i < MC_MAX_EM_ITER; ++i) - if (mc_gtfreq_iter(rst->g, ma, 0, ma->n) < MC_EM_EPS) break; - } { // estimate equal-tail credible interval (95% level) int l, h; double p; @@ -572,7 +520,7 @@ int bcf_p1_cal(const bcf1_t *b, bcf_p1aux_t *ma, bcf_p1rst_t *rst) rst->cil = (double)(ma->M - h) / ma->M; rst->cih = (double)(ma->M - l) / ma->M; } rst->cmp[0] = rst->cmp[1] = rst->cmp[2] = rst->p_chi2 = -1.0; - if (rst->p_var > 0.1) // skip contrast2() if the locus is a strong non-variant + if (do_contrast && rst->p_var > 0.5) // skip contrast2() if the locus is a strong non-variant rst->p_chi2 = contrast2(ma, rst->cmp); return 0; } diff --git a/bcftools/prob1.h b/bcftools/prob1.h index b824c3c..571f42f 100644 --- a/bcftools/prob1.h +++ b/bcftools/prob1.h @@ -8,10 +8,9 @@ typedef struct __bcf_p1aux_t bcf_p1aux_t; typedef struct { int rank0, perm_rank; // NB: perm_rank is always set to -1 by bcf_p1_cal() - double f_em, f_exp, f_flat, p_ref_folded, p_ref, p_var_folded, p_var; + double f_exp, f_flat, p_ref_folded, p_ref, p_var_folded, p_var; double cil, cih; - double cmp[3], p_chi2, f_em2[2]; // used by contrast2() - double g[3]; + double cmp[3], p_chi2; // used by contrast2() } bcf_p1rst_t; #define MC_PTYPE_FULL 1 @@ -26,13 +25,15 @@ extern "C" { void bcf_p1_init_prior(bcf_p1aux_t *ma, int type, double theta); void bcf_p1_init_subprior(bcf_p1aux_t *ma, int type, double theta); void bcf_p1_destroy(bcf_p1aux_t *ma); - int bcf_p1_cal(const bcf1_t *b, bcf_p1aux_t *ma, bcf_p1rst_t *rst); + int bcf_p1_cal(const bcf1_t *b, int do_contrast, bcf_p1aux_t *ma, bcf_p1rst_t *rst); int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k); void bcf_p1_dump_afs(bcf_p1aux_t *ma); int bcf_p1_read_prior(bcf_p1aux_t *ma, const char *fn); int bcf_p1_set_n1(bcf_p1aux_t *b, int n1); void bcf_p1_set_folded(bcf_p1aux_t *p1a); // only effective when set_n1() is not called + int bcf_em1(const bcf1_t *b, int n1, int flag, double x[9]); + #ifdef __cplusplus } #endif diff --git a/bedidx.c b/bedidx.c index 9297831..722877d 100644 --- a/bedidx.c +++ b/bedidx.c @@ -4,6 +4,9 @@ #include #include +#include "ksort.h" +KSORT_INIT_GENERIC(uint64_t) + #include "kseq.h" KSTREAM_INIT(gzFile, gzread, 8192) @@ -53,6 +56,7 @@ void bed_index(void *_h) if (kh_exist(h, k)) { bed_reglist_t *p = &kh_val(h, k); if (p->idx) free(p->idx); + ks_introsort(uint64_t, p->n, p->a); p->idx = bed_index_core(p->n, p->a, &p->m); } } diff --git a/examples/toy.sam b/examples/toy.sam index 1aff220..33449b1 100644 --- a/examples/toy.sam +++ b/examples/toy.sam @@ -1,6 +1,6 @@ @SQ SN:ref LN:45 @SQ SN:ref2 LN:40 -r001 163 ref 7 30 8M4I4M1D3M = 37 39 TTAGATAAAGAGGATACTG * +r001 163 ref 7 30 8M4I4M1D3M = 37 39 TTAGATAAAGAGGATACTG * XX:B:S,12561,2,20,112 r002 0 ref 9 30 1S2I6M1P1I1P1I4M2I * 0 0 AAAAGATAAGGGATAAA * r003 0 ref 9 30 5H6M * 0 0 AGCTAA * r004 0 ref 16 30 6M14N1I5M * 0 0 ATAGCTCTCAGC * diff --git a/sam_header.c b/sam_header.c index f49b2d4..f4c8a3b 100644 --- a/sam_header.c +++ b/sam_header.c @@ -38,7 +38,7 @@ const char *o_sq_tags[] = {"AS","M5","UR","SP",NULL}; const char *r_sq_tags[] = {"SN","LN",NULL}; const char *u_sq_tags[] = {"SN",NULL}; -const char *o_rg_tags[] = {"LB","DS","PU","PI","CN","DT","PL",NULL}; +const char *o_rg_tags[] = {"CN","DS","DT","FO","KS","LB","PG","PI","PL","PU","SM",NULL}; const char *r_rg_tags[] = {"ID",NULL}; const char *u_rg_tags[] = {"ID",NULL}; diff --git a/samtools.1 b/samtools.1 index ba32392..c71fc87 100644 --- a/samtools.1 +++ b/samtools.1 @@ -1,4 +1,4 @@ -.TH samtools 1 "16 March 2011" "samtools-0.1.14" "Bioinformatics tools" +.TH samtools 1 "21 April 2011" "samtools-0.1.16" "Bioinformatics tools" .SH NAME .PP samtools - Utilities for the Sequence Alignment/Map (SAM) format @@ -285,7 +285,7 @@ Approximately the maximum required memory. [500000000] .TP .B merge -samtools merge [-nur] [-h inh.sam] [-R reg] [...] +samtools merge [-nur1f] [-h inh.sam] [-R reg] [...] Merge multiple sorted alignments. The header reference lists of all the input BAM files, and the @SQ headers of @@ -302,6 +302,12 @@ and the headers of other files will be ignored. .B OPTIONS: .RS .TP 8 +.B -1 +Use zlib compression level 1 to comrpess the output +.TP +.B -f +Force to overwrite the output file if present. +.TP 8 .BI -h \ FILE Use the lines of .I FILE @@ -313,17 +319,18 @@ replacing any header lines that would otherwise be copied from is actually in SAM format, though any alignment records it may contain are ignored.) .TP +.B -n +The input alignments are sorted by read names rather than by chromosomal +coordinates +.TP .BI -R \ STR Merge files in the specified region indicated by .I STR +[null] .TP .B -r Attach an RG tag to each alignment. The tag value is inferred from file names. .TP -.B -n -The input alignments are sorted by read names rather than by chromosomal -coordinates -.TP .B -u Uncompressed BAM output .RE