X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=samtools.git;a=blobdiff_plain;f=bcftools%2Fprob1.c;h=193c4a0c535126496b254869a6aee193ac4ebbf6;hp=e3b0f7344d096f22c37048b2131219378bcb7ba5;hb=1539c893f863596db71ab2af75a802e935496940;hpb=62781a2daa24d74a3c590e2669fad1fa7cabf933 diff --git a/bcftools/prob1.c b/bcftools/prob1.c index e3b0f73..193c4a0 100644 --- a/bcftools/prob1.c +++ b/bcftools/prob1.c @@ -8,9 +8,9 @@ #include "kseq.h" KSTREAM_INIT(gzFile, gzread, 16384) -#define MC_AVG_ERR 0.007 #define MC_MAX_EM_ITER 16 #define MC_EM_EPS 1e-4 +#define MC_DEF_INDEL 0.15 unsigned char seq_nt4_table[256] = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, @@ -32,9 +32,9 @@ unsigned char seq_nt4_table[256] = { }; struct __bcf_p1aux_t { - int n, M, n1; + int n, M, n1, is_indel; double *q2p, *pdg; // pdg -> P(D|g) - double *phi; + double *phi, *phi_indel; double *z, *zswap; // aux for afs double *z1, *z2, *phi1, *phi2; // only calculated when n1 is set double t, t1, t2; @@ -43,6 +43,14 @@ struct __bcf_p1aux_t { int PL_len; }; +void bcf_p1_indel_prior(bcf_p1aux_t *ma, double x) +{ + int i; + for (i = 0; i < ma->M; ++i) + ma->phi_indel[i] = ma->phi[i] * x; + ma->phi_indel[ma->M] = 1. - ma->phi[ma->M] * x; +} + static void init_prior(int type, double theta, int M, double *phi) { int i; @@ -63,6 +71,7 @@ static void init_prior(int type, double theta, int M, double *phi) void bcf_p1_init_prior(bcf_p1aux_t *ma, int type, double theta) { init_prior(type, theta, ma->M, ma->phi); + bcf_p1_indel_prior(ma, MC_DEF_INDEL); } void bcf_p1_init_subprior(bcf_p1aux_t *ma, int type, double theta) @@ -110,6 +119,7 @@ int bcf_p1_read_prior(bcf_p1aux_t *ma, const char *fn) fprintf(stderr, "[%s] heterozygosity=%lf, ", __func__, (double)sum); for (sum = 0., k = 1; k <= ma->M; ++k) sum += k * ma->phi[ma->M - k] / ma->M; fprintf(stderr, "theta=%lf\n", (double)sum); + bcf_p1_indel_prior(ma, MC_DEF_INDEL); return 0; } @@ -123,6 +133,7 @@ bcf_p1aux_t *bcf_p1_init(int n) ma->q2p = calloc(256, sizeof(double)); ma->pdg = calloc(3 * ma->n, sizeof(double)); ma->phi = calloc(ma->M + 1, sizeof(double)); + ma->phi_indel = calloc(ma->M + 1, sizeof(double)); ma->phi1 = calloc(ma->M + 1, sizeof(double)); ma->phi2 = calloc(ma->M + 1, sizeof(double)); ma->z = calloc(2 * ma->n + 1, sizeof(double)); @@ -148,7 +159,7 @@ void bcf_p1_destroy(bcf_p1aux_t *ma) { if (ma) { free(ma->q2p); free(ma->pdg); - free(ma->phi); free(ma->phi1); free(ma->phi2); + free(ma->phi); free(ma->phi_indel); free(ma->phi1); free(ma->phi2); free(ma->z); free(ma->zswap); free(ma->z1); free(ma->z2); free(ma->afs); free(ma->afs1); free(ma); @@ -157,18 +168,16 @@ void bcf_p1_destroy(bcf_p1aux_t *ma) static int cal_pdg(const bcf1_t *b, bcf_p1aux_t *ma) { - int i, j, k; + int i, j; long *p, tmp; p = alloca(b->n_alleles * sizeof(long)); memset(p, 0, sizeof(long) * b->n_alleles); for (j = 0; j < ma->n; ++j) { const uint8_t *pi = ma->PL + j * ma->PL_len; double *pdg = ma->pdg + j * 3; - pdg[0] = ma->q2p[pi[b->n_alleles]]; pdg[1] = ma->q2p[pi[1]]; pdg[2] = ma->q2p[pi[0]]; - for (i = k = 0; i < b->n_alleles; ++i) { - p[i] += (int)pi[k]; - k += b->n_alleles - i; - } + pdg[0] = ma->q2p[pi[2]]; pdg[1] = ma->q2p[pi[1]]; pdg[2] = ma->q2p[pi[0]]; + for (i = 0; i < b->n_alleles; ++i) + p[i] += (int)pi[(i+1)*(i+2)/2-1]; } for (i = 0; i < b->n_alleles; ++i) p[i] = p[i]<<4 | i; for (i = 1; i < b->n_alleles; ++i) // insertion sort @@ -299,18 +308,28 @@ static void contrast(bcf_p1aux_t *ma, double pc[4]) // mc_cal_y() must be called pc[1] = pc[1] == 1.? 99 : (int)(-4.343 * log(1. - pc[1]) + .499); } -static double mc_cal_afs(bcf_p1aux_t *ma) +static double mc_cal_afs(bcf_p1aux_t *ma, double *p_ref_folded, double *p_var_folded) { int k; - long double sum = 0.; + long double sum = 0., sum2; + double *phi = ma->is_indel? ma->phi_indel : ma->phi; memset(ma->afs1, 0, sizeof(double) * (ma->M + 1)); mc_cal_y(ma); + // compute AFS for (k = 0, sum = 0.; k <= ma->M; ++k) - sum += (long double)ma->phi[k] * ma->z[k]; + sum += (long double)phi[k] * ma->z[k]; for (k = 0; k <= ma->M; ++k) { - ma->afs1[k] = ma->phi[k] * ma->z[k] / sum; + ma->afs1[k] = phi[k] * ma->z[k] / sum; if (isnan(ma->afs1[k]) || isinf(ma->afs1[k])) return -1.; } + // compute folded variant probability + for (k = 0, sum = 0.; k <= ma->M; ++k) + sum += (long double)(phi[k] + phi[ma->M - k]) / 2. * ma->z[k]; + for (k = 1, sum2 = 0.; k < ma->M; ++k) + sum2 += (long double)(phi[k] + phi[ma->M - k]) / 2. * ma->z[k]; + *p_var_folded = sum2 / sum; + *p_ref_folded = (phi[k] + phi[ma->M - k]) / 2. * (ma->z[ma->M] + ma->z[0]) / sum; + // the expected frequency for (k = 0, sum = 0.; k <= ma->M; ++k) { ma->afs[k] += ma->afs1[k]; sum += k * ma->afs1[k]; @@ -344,10 +363,11 @@ long double bcf_p1_cal_g3(bcf_p1aux_t *p1a, double g[3]) return pd; } -int bcf_p1_cal(bcf1_t *b, bcf_p1aux_t *ma, bcf_p1rst_t *rst) +int bcf_p1_cal(const bcf1_t *b, bcf_p1aux_t *ma, bcf_p1rst_t *rst) { int i, k; long double sum = 0.; + ma->is_indel = bcf_is_indel(b); // set PL and PL_len for (i = 0; i < b->n_gi; ++i) { if (b->gi[i].fmt == bcf_str2int("PL", 2)) { @@ -359,8 +379,11 @@ int bcf_p1_cal(bcf1_t *b, bcf_p1aux_t *ma, bcf_p1rst_t *rst) if (b->n_alleles < 2) return -1; // FIXME: find a better solution // rst->rank0 = cal_pdg(b, ma); - rst->f_exp = mc_cal_afs(ma); + rst->f_exp = mc_cal_afs(ma, &rst->p_ref_folded, &rst->p_var_folded); rst->p_ref = ma->afs1[ma->M]; + for (k = 0, sum = 0.; k < ma->M; ++k) + sum += ma->afs1[k]; + rst->p_var = (double)sum; // calculate f_flat and f_em for (k = 0, sum = 0.; k <= ma->M; ++k) sum += (long double)ma->z[k]; @@ -378,6 +401,19 @@ int bcf_p1_cal(bcf1_t *b, bcf_p1aux_t *ma, bcf_p1rst_t *rst) flast = rst->f_em; } } + { // estimate equal-tail credible interval (95% level) + int l, h; + double p; + for (i = 0, p = 0.; i < ma->M; ++i) + if (p + ma->afs1[i] > 0.025) break; + else p += ma->afs1[i]; + l = i; + for (i = ma->M-1, p = 0.; i >= 0; --i) + if (p + ma->afs1[i] > 0.025) break; + else p += ma->afs1[i]; + h = i; + rst->cil = (double)(ma->M - h) / ma->M; rst->cih = (double)(ma->M - l) / ma->M; + } rst->g[0] = rst->g[1] = rst->g[2] = -1.; contrast(ma, rst->pc); return 0;