From: Charles Plessy Date: Fri, 3 Dec 2010 04:24:34 +0000 (+0900) Subject: Imported Upstream version 0.1.11 X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=samtools.git;a=commitdiff_plain;h=6a0c6f060a60789b48e10a72b1381f6e54599302 Imported Upstream version 0.1.11 --- diff --git a/ChangeLog b/ChangeLog index f8456a3..dd62b49 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,151 @@ +------------------------------------------------------------------------ +r844 | lh3lh3 | 2010-11-19 23:16:08 -0500 (Fri, 19 Nov 2010) | 3 lines +Changed paths: + M /trunk/samtools/bamtk.c + M /trunk/samtools/bcftools/call1.c + M /trunk/samtools/bcftools/prob1.c + M /trunk/samtools/bcftools/prob1.h + + * samtools-0.1.10-9 (r844) + * added the "folded" or reference-free mode for variant calling + +------------------------------------------------------------------------ +r843 | lh3lh3 | 2010-11-19 22:26:36 -0500 (Fri, 19 Nov 2010) | 2 lines +Changed paths: + M /trunk/samtools/NEWS + M /trunk/samtools/bam_sort.c + +In merging, if -R is specified, do not abort if the sequence dictionary is different. + +------------------------------------------------------------------------ +r842 | jmarshall | 2010-11-19 21:24:28 -0500 (Fri, 19 Nov 2010) | 5 lines +Changed paths: + M /trunk/samtools/bam_sort.c + +When merging BAM headers, compare the list of target reference sequences +strictly (and fail/abort if there is a mismatch), but allow one list to be a +prefix of the other. (i.e., check that the lists are identical up until the +shorter runs out, and add the excess targets from the longer to the output.) + +------------------------------------------------------------------------ +r841 | lh3lh3 | 2010-11-19 14:49:27 -0500 (Fri, 19 Nov 2010) | 4 lines +Changed paths: + M /trunk/samtools/bam_index.c + M /trunk/samtools/bam_pileup.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.10 (r841) + * fixed a bug in pileup when the first CIGAR operation is D + * fixed a bug in view with range query + +------------------------------------------------------------------------ +r840 | lh3lh3 | 2010-11-19 13:45:51 -0500 (Fri, 19 Nov 2010) | 10 lines +Changed paths: + M /trunk/samtools/ChangeLog + M /trunk/samtools/bam2bcf.c + M /trunk/samtools/bam2bcf.h + M /trunk/samtools/bam2bcf_indel.c + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.10-4 (r840) + + * drop the MNP caller. It is slow while does not diliver too much + benefit. Possibly I will work on it in future given more time. + + * there is a segfault in pileup + + * someone has reported segfault from view/index/sort + + +------------------------------------------------------------------------ +r839 | lh3lh3 | 2010-11-18 17:30:11 -0500 (Thu, 18 Nov 2010) | 9 lines +Changed paths: + M /trunk/samtools/bam2bcf.h + M /trunk/samtools/bam2bcf_indel.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.10-6 (r839) + + * call MNPs without realignment because it seems to me that it is not + worthwhile to significantly slow down SNP calling. + + * the result looks quite different from the previous version. I have + work to do... + + +------------------------------------------------------------------------ +r838 | lh3lh3 | 2010-11-18 11:26:09 -0500 (Thu, 18 Nov 2010) | 2 lines +Changed paths: + M /trunk/samtools/knetfile.c + +Apply a patch by Rob Davis, which improves fault detection. + +------------------------------------------------------------------------ +r836 | lh3lh3 | 2010-11-18 11:09:23 -0500 (Thu, 18 Nov 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam2bcf_indel.c + M /trunk/samtools/bamtk.c + + * samtools-r836 + * initiate MNP realignment when the MNP has at least 0.2% frequency (otherwise too slow) + +------------------------------------------------------------------------ +r835 | lh3lh3 | 2010-11-18 00:25:13 -0500 (Thu, 18 Nov 2010) | 3 lines +Changed paths: + M /trunk/samtools/bcftools/vcfutils.pl + + * modify the filtering rule: also filter SNPs around filtered indels + * added MNP filter + +------------------------------------------------------------------------ +r834 | lh3lh3 | 2010-11-17 23:13:52 -0500 (Wed, 17 Nov 2010) | 4 lines +Changed paths: + M /trunk/samtools/bam2bcf.c + M /trunk/samtools/bam2bcf_indel.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.10-4 (r834) + * fixed a silly bug in printing MNP + * restrict to at most 1 alternative allele + +------------------------------------------------------------------------ +r833 | lh3lh3 | 2010-11-17 21:58:58 -0500 (Wed, 17 Nov 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam2bcf.c + M /trunk/samtools/bamtk.c + +fixed a bug in printing MNPs + +------------------------------------------------------------------------ +r832 | lh3lh3 | 2010-11-17 21:47:20 -0500 (Wed, 17 Nov 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam2bcf_indel.c + M /trunk/samtools/bamtk.c + +minor change to how seqQ is applied + +------------------------------------------------------------------------ +r831 | lh3lh3 | 2010-11-17 21:41:12 -0500 (Wed, 17 Nov 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam2bcf.c + M /trunk/samtools/bam2bcf.h + M /trunk/samtools/bam2bcf_indel.c + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.10 (r831) + * initial MNP caller + +------------------------------------------------------------------------ +r829 | lh3lh3 | 2010-11-16 23:14:15 -0500 (Tue, 16 Nov 2010) | 2 lines +Changed paths: + M /trunk/samtools/ChangeLog + M /trunk/samtools/NEWS + M /trunk/samtools/bamtk.c + +Release samtools-0.1.10 (r829) + ------------------------------------------------------------------------ r828 | lh3lh3 | 2010-11-16 20:48:49 -0500 (Tue, 16 Nov 2010) | 2 lines Changed paths: diff --git a/NEWS b/NEWS index 6c2195e..478e718 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,34 @@ +Beta release 0.1.11 (21 November, 2010) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This is mainly a bug fix release: + + * Fixed a bug in random retrieval (since 0.1.8). It occurs when reads + are retrieved from a small region containing no reads. + + * Fixed a bug in pileup (since 0.1.9). The bug causes an assertion + failure when the first CIGAR operation is a deletion. + + * Improved fault tolerence in remote access. + +One minor feature has been implemented in bcftools: + + * Added a reference-free variant calling mode. In this mode, a site is + regarded as a variat iff the sample(s) contains two or more alleles; + the meaning of the QUAL field in the VCF output is changed + accordingly. Effectively, the reference allele is irrelevant to the + result in the new mode, although the reference sequence has to be + used in realignment when SAMtools computes genotype likelihoods. + +In addition, since 0.1.10, the `pileup' command has been deprecated by +`mpileup' which is more powerful and more accurate. The `pileup' command +will not be removed in the next few releases, but new features will not +be added. + +(0.1.11: 21 November 2010, r851) + + + Beta Release 0.1.10 (16 November, 2010) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/bam_index.c b/bam_index.c index 1ad2e93..f60a6f8 100644 --- a/bam_index.c +++ b/bam_index.c @@ -608,6 +608,9 @@ bam_iter_t bam_iter_query(const bam_index_t *idx, int tid, int beg, int end) } } free(bins); + if (n_off == 0) { + free(off); return iter; + } { bam1_t *b = (bam1_t*)calloc(1, sizeof(bam1_t)); int l; diff --git a/bam_pileup.c b/bam_pileup.c index 55e51e2..3e26f74 100644 --- a/bam_pileup.c +++ b/bam_pileup.c @@ -79,12 +79,12 @@ static inline int resolve_cigar2(bam_pileup1_t *p, uint32_t pos, cstate_t *s) is_head = 1; if (c->n_cigar == 1) { // just one operation, save a loop if (_cop(cigar[0]) == BAM_CMATCH) s->k = 0, s->x = c->pos, s->y = 0; - } else { // find the first match + } else { // find the first match or deletion for (k = 0, s->x = c->pos, s->y = 0; k < c->n_cigar; ++k) { int op = _cop(cigar[k]); int l = _cln(cigar[k]); - if (op == BAM_CMATCH) break; - else if (op == BAM_CDEL || op == BAM_CREF_SKIP) s->x += l; + if (op == BAM_CMATCH || op == BAM_CDEL) break; + else if (op == BAM_CREF_SKIP) s->x += l; else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l; } assert(k < c->n_cigar); diff --git a/bam_sort.c b/bam_sort.c index 76ab793..01f7016 100644 --- a/bam_sort.c +++ b/bam_sort.c @@ -52,6 +52,14 @@ static inline int heap_lt(const heap1_t a, const heap1_t b) KSORT_INIT(heap, heap1_t, heap_lt) +static void swap_header_targets(bam_header_t *h1, bam_header_t *h2) +{ + bam_header_t t; + t.n_targets = h1->n_targets, h1->n_targets = h2->n_targets, h2->n_targets = t.n_targets; + t.target_name = h1->target_name, h1->target_name = h2->target_name, h2->target_name = t.target_name; + t.target_len = h1->target_len, h1->target_len = h2->target_len, h2->target_len = t.target_len; +} + static void swap_header_text(bam_header_t *h1, bam_header_t *h2) { int tempi; @@ -130,42 +138,51 @@ int bam_merge_core(int by_qname, const char *out, const char *headers, int n, ch return -1; } hin = bam_header_read(fp[i]); - if (i == 0) { // the first SAM + if (i == 0) { // the first BAM hout = hin; - if (hheaders) { - // If the text headers to be swapped in include any @SQ headers, - // check that they are consistent with the existing binary list - // of reference information. - if (hheaders->n_targets > 0) { - if (hout->n_targets != hheaders->n_targets) { - fprintf(stderr, "[bam_merge_core] number of @SQ headers in `%s' differs from number of target sequences", headers); - if (!reg) return -1; - } - for (j = 0; j < hout->n_targets; ++j) - if (strcmp(hout->target_name[j], hheaders->target_name[j]) != 0) { - fprintf(stderr, "[bam_merge_core] @SQ header '%s' in '%s' differs from target sequence", hheaders->target_name[j], headers); - if (!reg) return -1; - } - } - swap_header_text(hout, hheaders); - bam_header_destroy(hheaders); - hheaders = NULL; - } } else { // validate multiple baf - if (hout->n_targets != hin->n_targets) { - fprintf(stderr, "[bam_merge_core] file '%s' has different number of target sequences. Continue anyway!\n", fn[i]); - } else { - for (j = 0; j < hout->n_targets; ++j) { - if (strcmp(hout->target_name[j], hin->target_name[j])) { - fprintf(stderr, "[bam_merge_core] different target sequence name: '%s' != '%s' in file '%s'. Continue anyway!\n", - hout->target_name[j], hin->target_name[j], fn[i]); - } + int min_n_targets = hout->n_targets; + if (hin->n_targets < min_n_targets) min_n_targets = hin->n_targets; + + for (j = 0; j < min_n_targets; ++j) + if (strcmp(hout->target_name[j], hin->target_name[j]) != 0) { + fprintf(stderr, "[bam_merge_core] different target sequence name: '%s' != '%s' in file '%s'\n", + hout->target_name[j], hin->target_name[j], fn[i]); + return -1; } + + // If this input file has additional target reference sequences, + // add them to the headers to be output + if (hin->n_targets > hout->n_targets) { + swap_header_targets(hout, hin); + // FIXME Possibly we should also create @SQ text headers + // for the newly added reference sequences } + bam_header_destroy(hin); } } + if (hheaders) { + // If the text headers to be swapped in include any @SQ headers, + // check that they are consistent with the existing binary list + // of reference information. + if (hheaders->n_targets > 0) { + if (hout->n_targets != hheaders->n_targets) { + fprintf(stderr, "[bam_merge_core] number of @SQ headers in '%s' differs from number of target sequences\n", headers); + if (!reg) return -1; + } + for (j = 0; j < hout->n_targets; ++j) + if (strcmp(hout->target_name[j], hheaders->target_name[j]) != 0) { + fprintf(stderr, "[bam_merge_core] @SQ header '%s' in '%s' differs from target sequence\n", hheaders->target_name[j], headers); + if (!reg) return -1; + } + } + + swap_header_text(hout, hheaders); + bam_header_destroy(hheaders); + } + if (reg) { int tid, beg, end; if (bam_parse_region(hout, reg, &tid, &beg, &end) < 0) { diff --git a/bamtk.c b/bamtk.c index 88328d4..4c2a4a2 100644 --- a/bamtk.c +++ b/bamtk.c @@ -9,7 +9,7 @@ #endif #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.1.10 (r829)" +#define PACKAGE_VERSION "0.1.11 (r851)" #endif int bam_taf2baf(int argc, char *argv[]); diff --git a/bcftools/bcftools.1 b/bcftools/bcftools.1 index ebff301..6c7403b 100644 --- a/bcftools/bcftools.1 +++ b/bcftools/bcftools.1 @@ -74,6 +74,11 @@ Skip sites where the REF field is not A/C/G/T .B -Q Output the QCALL likelihood format .TP +.B -f +Reference-free variant calling mode. In this mode, the prior will be +folded; a variant is called iff the sample(s) contains at least two +alleles; the QUAL field in the VCF/BCF output is changed accordingly. +.TP .BI "-1 " INT Number of group-1 samples. This option is used for dividing input into two groups for comparing. A zero value disables this functionality. [0] diff --git a/bcftools/call1.c b/bcftools/call1.c index b0b14fc..5e4fc9f 100644 --- a/bcftools/call1.c +++ b/bcftools/call1.c @@ -26,6 +26,7 @@ KSTREAM_INIT(gzFile, gzread, 16384) #define VC_CALL_GT 2048 #define VC_ADJLD 4096 #define VC_NO_INDEL 8192 +#define VC_FOLDED 16384 typedef struct { int flag, prior_type, n1; @@ -216,8 +217,9 @@ int bcfview(int argc, char *argv[]) tid = begin = end = -1; memset(&vc, 0, sizeof(viewconf_t)); vc.prior_type = vc.n1 = -1; vc.theta = 1e-3; vc.pref = 0.5; vc.indel_frac = -1.; - while ((c = getopt(argc, argv, "N1:l:cHAGvbSuP:t:p:QgLi:I")) >= 0) { + while ((c = getopt(argc, argv, "fN1:l:cHAGvbSuP:t:p:QgLi:I")) >= 0) { switch (c) { + case 'f': vc.flag |= VC_FOLDED; break; case '1': vc.n1 = atoi(optarg); break; case 'l': vc.fn_list = strdup(optarg); break; case 'N': vc.flag |= VC_ACGT_ONLY; break; @@ -260,6 +262,7 @@ int bcfview(int argc, char *argv[]) fprintf(stderr, " -Q output the QCALL likelihood format\n"); fprintf(stderr, " -L calculate LD for adjacent sites\n"); fprintf(stderr, " -I skip indels\n"); + fprintf(stderr, " -f reference-free variant calling\n"); fprintf(stderr, " -1 INT number of group-1 samples [0]\n"); fprintf(stderr, " -l FILE list of sites to output [all sites]\n"); fprintf(stderr, " -t FLOAT scaled substitution mutation rate [%.4lg]\n", vc.theta); @@ -293,7 +296,8 @@ int bcfview(int argc, char *argv[]) bcf_p1_set_n1(p1, vc.n1); bcf_p1_init_subprior(p1, vc.prior_type, vc.theta); } - if (vc.indel_frac > 0.) bcf_p1_indel_prior(p1, vc.indel_frac); + if (vc.indel_frac > 0.) bcf_p1_indel_prior(p1, vc.indel_frac); // otherwise use the default indel_frac + if (vc.flag & VC_FOLDED) bcf_p1_set_folded(p1); } if (vc.fn_list) hash = bcf_load_pos(vc.fn_list, h); if (optind + 1 < argc && !(vc.flag&VC_VCFIN)) { diff --git a/bcftools/prob1.c b/bcftools/prob1.c index 176a0fc..8bf968f 100644 --- a/bcftools/prob1.c +++ b/bcftools/prob1.c @@ -32,7 +32,7 @@ unsigned char seq_nt4_table[256] = { }; struct __bcf_p1aux_t { - int n, M, n1, is_indel; + int n, M, n1, is_indel, is_folded; double *q2p, *pdg; // pdg -> P(D|g) double *phi, *phi_indel; double *z, *zswap; // aux for afs @@ -43,6 +43,13 @@ struct __bcf_p1aux_t { int PL_len; }; +static void fold_array(int M, double *x) +{ + int k; + for (k = 0; k < M/2; ++k) + x[k] = x[M-k] = (x[k] + x[M-k]) / 2.; +} + void bcf_p1_indel_prior(bcf_p1aux_t *ma, double x) { int i; @@ -155,6 +162,15 @@ int bcf_p1_set_n1(bcf_p1aux_t *b, int n1) return 0; } +void bcf_p1_set_folded(bcf_p1aux_t *p1a) +{ + if (p1a->n1 < 0) { + p1a->is_folded = 1; + fold_array(p1a->M, p1a->phi); + fold_array(p1a->M, p1a->phi_indel); + } +} + void bcf_p1_destroy(bcf_p1aux_t *ma) { if (ma) { @@ -373,7 +389,7 @@ int bcf_p1_cal(bcf1_t *b, bcf_p1aux_t *ma, bcf_p1rst_t *rst) // rst->rank0 = cal_pdg(b, ma); rst->f_exp = mc_cal_afs(ma); - rst->p_ref = ma->afs1[ma->M]; + rst->p_ref = ma->is_folded? ma->afs1[ma->M] + ma->afs1[0] : ma->afs1[ma->M]; // calculate f_flat and f_em for (k = 0, sum = 0.; k <= ma->M; ++k) sum += (long double)ma->z[k]; @@ -412,6 +428,7 @@ int bcf_p1_cal(bcf1_t *b, bcf_p1aux_t *ma, bcf_p1rst_t *rst) void bcf_p1_dump_afs(bcf_p1aux_t *ma) { int k; + if (ma->is_folded) fold_array(ma->M, ma->afs); fprintf(stderr, "[afs]"); for (k = 0; k <= ma->M; ++k) fprintf(stderr, " %d:%.3lf", k, ma->afs[ma->M - k]); diff --git a/bcftools/prob1.h b/bcftools/prob1.h index 6d9d28e..3827534 100644 --- a/bcftools/prob1.h +++ b/bcftools/prob1.h @@ -32,6 +32,7 @@ extern "C" { int bcf_p1_read_prior(bcf_p1aux_t *ma, const char *fn); long double bcf_p1_cal_g3(bcf_p1aux_t *p1a, double g[3]); int bcf_p1_set_n1(bcf_p1aux_t *b, int n1); + void bcf_p1_set_folded(bcf_p1aux_t *p1a); // only effective when set_n1() is not called #ifdef __cplusplus } diff --git a/bcftools/vcfutils.pl b/bcftools/vcfutils.pl index bbc479b..6cc168f 100755 --- a/bcftools/vcfutils.pl +++ b/bcftools/vcfutils.pl @@ -231,13 +231,17 @@ Note: Some of the filters rely on annotations generated by SAMtools\/BCFtools. } next if ($t[4] eq '.'); # skip non-var sites # check if the site is a SNP - my $is_snp = 1; + my $type = 1; # SNP if (length($t[3]) > 1) { - $is_snp = 0; + $type = 2; # MNP + my @s = split(',', $t[4]); + for (@s) { + $type = 3 if (length != length($t[3])); + } } else { my @s = split(',', $t[4]); for (@s) { - $is_snp = 0 if (length > 1); + $type = 3 if (length > 1); } } # clear the out-of-range elements @@ -270,35 +274,42 @@ Note: Some of the filters rely on annotations generated by SAMtools\/BCFtools. $flt = 7 if ($flt == 0 && /PV4=([^,]+),([^,]+),([^,]+),([^,;\t]+)/ && ($1<$opts{1} || $2<$opts{2} || $3<$opts{3} || $4<$opts{4})); - # site dependent filters - my ($rlen, $indel_score) = (0, -1); # $indel_score<0 for SNPs + my $score = $t[5] * 100 + $dp_alt; + my $rlen = length($t[3]) - 1; # $indel_score<0 for SNPs if ($flt == 0) { - if (!$is_snp) { # an indel - $rlen = length($t[3]) - 1; - $indel_score = $t[5] * 100 + $dp_alt; - # filtering SNPs + if ($type == 3) { # an indel + # filtering SNPs and MNPs for my $x (@staging) { - next if ($x->[0] >= 0 || $x->[1] || $x->[4] + $x->[2] + $ow < $t[1]); + next if (($x->[0]&3) == 3 || $x->[1] || $x->[4] + $x->[2] + $ow < $t[1]); $x->[1] = 5; } # check the staging list for indel filtering for my $x (@staging) { - next if ($x->[0] < 0 || $x->[1] || $x->[4] + $x->[2] + $ol < $t[1]); - if ($x->[0] < $indel_score) { + next if (($x->[0]&3) != 3 || $x->[1] || $x->[4] + $x->[2] + $ol < $t[1]); + if ($x->[0]>>2 < $score) { $x->[1] = 6; } else { $flt = 6; last; } } - } else { # a SNP + } else { # SNP or MNP for my $x (@staging) { - next if ($x->[0] < 0 || $x->[1] || $x->[4] + $x->[2] + $ow < $t[1]); + next if (($x->[0]&3) != 3 || $x->[4] + $x->[2] + $ow < $t[1]); $flt = 5; last; } + # check MNP + for my $x (@staging) { + next if (($x->[0]&3) == 3 || $x->[4] + $x->[2] < $t[1]); + if ($x->[0]>>2 < $score) { + $x->[1] = 8; + } else { + $flt = 8; last; + } + } } } - push(@staging, [$indel_score, $flt, $rlen, @t]); + push(@staging, [$score<<2|$type, $flt, $rlen, @t]); } # output the last few elements in the staging list while (@staging) { @@ -311,7 +322,7 @@ sub varFilter_aux { if ($first->[1] == 0) { print join("\t", @$first[3 .. @$first-1]), "\n"; } elsif ($is_print) { - print STDERR join("\t", substr("UQdDaGgP", $first->[1], 1), @$first[3 .. @$first-1]), "\n"; + print STDERR join("\t", substr("UQdDaGgPM", $first->[1], 1), @$first[3 .. @$first-1]), "\n"; } } diff --git a/knetfile.c b/knetfile.c index e1be4d6..1e2c042 100644 --- a/knetfile.c +++ b/knetfile.c @@ -517,7 +517,10 @@ off_t knet_read(knetFile *fp, void *buf, off_t len) if (fp->type == KNF_TYPE_LOCAL) { // on Windows, the following block is necessary; not on UNIX off_t rest = len, curr; while (rest) { - curr = read(fp->fd, buf + l, rest); + do { + curr = read(fp->fd, buf + l, rest); + } while (curr < 0 && EINTR == errno); + if (curr < 0) return -1; if (curr == 0) break; l += curr; rest -= curr; } diff --git a/samtools.1 b/samtools.1 index e87bef7..e059560 100644 --- a/samtools.1 +++ b/samtools.1 @@ -1,4 +1,4 @@ -.TH samtools 1 "15 November 2010" "samtools-0.1.10" "Bioinformatics tools" +.TH samtools 1 "21 November 2010" "samtools-0.1.11" "Bioinformatics tools" .SH NAME .PP samtools - Utilities for the Sequence Alignment/Map (SAM) format diff --git a/samtools.txt b/samtools.txt index 41cabe5..5e4bb2a 100644 --- a/samtools.txt +++ b/samtools.txt @@ -556,4 +556,4 @@ SEE ALSO -samtools-0.1.10 15 November 2010 samtools(1) +samtools-0.1.11 21 November 2010 samtools(1)