+------------------------------------------------------------------------
+r844 | lh3lh3 | 2010-11-19 23:16:08 -0500 (Fri, 19 Nov 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/bcftools/call1.c
+ M /trunk/samtools/bcftools/prob1.c
+ M /trunk/samtools/bcftools/prob1.h
+
+ * samtools-0.1.10-9 (r844)
+ * added the "folded" or reference-free mode for variant calling
+
+------------------------------------------------------------------------
+r843 | lh3lh3 | 2010-11-19 22:26:36 -0500 (Fri, 19 Nov 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/NEWS
+ M /trunk/samtools/bam_sort.c
+
+In merging, if -R is specified, do not abort if the sequence dictionary is different.
+
+------------------------------------------------------------------------
+r842 | jmarshall | 2010-11-19 21:24:28 -0500 (Fri, 19 Nov 2010) | 5 lines
+Changed paths:
+ M /trunk/samtools/bam_sort.c
+
+When merging BAM headers, compare the list of target reference sequences
+strictly (and fail/abort if there is a mismatch), but allow one list to be a
+prefix of the other. (i.e., check that the lists are identical up until the
+shorter runs out, and add the excess targets from the longer to the output.)
+
+------------------------------------------------------------------------
+r841 | lh3lh3 | 2010-11-19 14:49:27 -0500 (Fri, 19 Nov 2010) | 4 lines
+Changed paths:
+ M /trunk/samtools/bam_index.c
+ M /trunk/samtools/bam_pileup.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.10 (r841)
+ * fixed a bug in pileup when the first CIGAR operation is D
+ * fixed a bug in view with range query
+
+------------------------------------------------------------------------
+r840 | lh3lh3 | 2010-11-19 13:45:51 -0500 (Fri, 19 Nov 2010) | 10 lines
+Changed paths:
+ M /trunk/samtools/ChangeLog
+ M /trunk/samtools/bam2bcf.c
+ M /trunk/samtools/bam2bcf.h
+ M /trunk/samtools/bam2bcf_indel.c
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.10-4 (r840)
+
+ * drop the MNP caller. It is slow while does not diliver too much
+ benefit. Possibly I will work on it in future given more time.
+
+ * there is a segfault in pileup
+
+ * someone has reported segfault from view/index/sort
+
+
+------------------------------------------------------------------------
+r839 | lh3lh3 | 2010-11-18 17:30:11 -0500 (Thu, 18 Nov 2010) | 9 lines
+Changed paths:
+ M /trunk/samtools/bam2bcf.h
+ M /trunk/samtools/bam2bcf_indel.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.10-6 (r839)
+
+ * call MNPs without realignment because it seems to me that it is not
+ worthwhile to significantly slow down SNP calling.
+
+ * the result looks quite different from the previous version. I have
+ work to do...
+
+
+------------------------------------------------------------------------
+r838 | lh3lh3 | 2010-11-18 11:26:09 -0500 (Thu, 18 Nov 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/knetfile.c
+
+Apply a patch by Rob Davis, which improves fault detection.
+
+------------------------------------------------------------------------
+r836 | lh3lh3 | 2010-11-18 11:09:23 -0500 (Thu, 18 Nov 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam2bcf_indel.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-r836
+ * initiate MNP realignment when the MNP has at least 0.2% frequency (otherwise too slow)
+
+------------------------------------------------------------------------
+r835 | lh3lh3 | 2010-11-18 00:25:13 -0500 (Thu, 18 Nov 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bcftools/vcfutils.pl
+
+ * modify the filtering rule: also filter SNPs around filtered indels
+ * added MNP filter
+
+------------------------------------------------------------------------
+r834 | lh3lh3 | 2010-11-17 23:13:52 -0500 (Wed, 17 Nov 2010) | 4 lines
+Changed paths:
+ M /trunk/samtools/bam2bcf.c
+ M /trunk/samtools/bam2bcf_indel.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.10-4 (r834)
+ * fixed a silly bug in printing MNP
+ * restrict to at most 1 alternative allele
+
+------------------------------------------------------------------------
+r833 | lh3lh3 | 2010-11-17 21:58:58 -0500 (Wed, 17 Nov 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bam2bcf.c
+ M /trunk/samtools/bamtk.c
+
+fixed a bug in printing MNPs
+
+------------------------------------------------------------------------
+r832 | lh3lh3 | 2010-11-17 21:47:20 -0500 (Wed, 17 Nov 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bam2bcf_indel.c
+ M /trunk/samtools/bamtk.c
+
+minor change to how seqQ is applied
+
+------------------------------------------------------------------------
+r831 | lh3lh3 | 2010-11-17 21:41:12 -0500 (Wed, 17 Nov 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam2bcf.c
+ M /trunk/samtools/bam2bcf.h
+ M /trunk/samtools/bam2bcf_indel.c
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.10 (r831)
+ * initial MNP caller
+
+------------------------------------------------------------------------
+r829 | lh3lh3 | 2010-11-16 23:14:15 -0500 (Tue, 16 Nov 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/ChangeLog
+ M /trunk/samtools/NEWS
+ M /trunk/samtools/bamtk.c
+
+Release samtools-0.1.10 (r829)
+
------------------------------------------------------------------------
r828 | lh3lh3 | 2010-11-16 20:48:49 -0500 (Tue, 16 Nov 2010) | 2 lines
Changed paths:
+Beta release 0.1.11 (21 November, 2010)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This is mainly a bug fix release:
+
+ * Fixed a bug in random retrieval (since 0.1.8). It occurs when reads
+ are retrieved from a small region containing no reads.
+
+ * Fixed a bug in pileup (since 0.1.9). The bug causes an assertion
+ failure when the first CIGAR operation is a deletion.
+
+ * Improved fault tolerence in remote access.
+
+One minor feature has been implemented in bcftools:
+
+ * Added a reference-free variant calling mode. In this mode, a site is
+ regarded as a variat iff the sample(s) contains two or more alleles;
+ the meaning of the QUAL field in the VCF output is changed
+ accordingly. Effectively, the reference allele is irrelevant to the
+ result in the new mode, although the reference sequence has to be
+ used in realignment when SAMtools computes genotype likelihoods.
+
+In addition, since 0.1.10, the `pileup' command has been deprecated by
+`mpileup' which is more powerful and more accurate. The `pileup' command
+will not be removed in the next few releases, but new features will not
+be added.
+
+(0.1.11: 21 November 2010, r851)
+
+
+
Beta Release 0.1.10 (16 November, 2010)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
}
}
free(bins);
+ if (n_off == 0) {
+ free(off); return iter;
+ }
{
bam1_t *b = (bam1_t*)calloc(1, sizeof(bam1_t));
int l;
is_head = 1;
if (c->n_cigar == 1) { // just one operation, save a loop
if (_cop(cigar[0]) == BAM_CMATCH) s->k = 0, s->x = c->pos, s->y = 0;
- } else { // find the first match
+ } else { // find the first match or deletion
for (k = 0, s->x = c->pos, s->y = 0; k < c->n_cigar; ++k) {
int op = _cop(cigar[k]);
int l = _cln(cigar[k]);
- if (op == BAM_CMATCH) break;
- else if (op == BAM_CDEL || op == BAM_CREF_SKIP) s->x += l;
+ if (op == BAM_CMATCH || op == BAM_CDEL) break;
+ else if (op == BAM_CREF_SKIP) s->x += l;
else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
}
assert(k < c->n_cigar);
KSORT_INIT(heap, heap1_t, heap_lt)
+static void swap_header_targets(bam_header_t *h1, bam_header_t *h2)
+{
+ bam_header_t t;
+ t.n_targets = h1->n_targets, h1->n_targets = h2->n_targets, h2->n_targets = t.n_targets;
+ t.target_name = h1->target_name, h1->target_name = h2->target_name, h2->target_name = t.target_name;
+ t.target_len = h1->target_len, h1->target_len = h2->target_len, h2->target_len = t.target_len;
+}
+
static void swap_header_text(bam_header_t *h1, bam_header_t *h2)
{
int tempi;
return -1;
}
hin = bam_header_read(fp[i]);
- if (i == 0) { // the first SAM
+ if (i == 0) { // the first BAM
hout = hin;
- if (hheaders) {
- // If the text headers to be swapped in include any @SQ headers,
- // check that they are consistent with the existing binary list
- // of reference information.
- if (hheaders->n_targets > 0) {
- if (hout->n_targets != hheaders->n_targets) {
- fprintf(stderr, "[bam_merge_core] number of @SQ headers in `%s' differs from number of target sequences", headers);
- if (!reg) return -1;
- }
- for (j = 0; j < hout->n_targets; ++j)
- if (strcmp(hout->target_name[j], hheaders->target_name[j]) != 0) {
- fprintf(stderr, "[bam_merge_core] @SQ header '%s' in '%s' differs from target sequence", hheaders->target_name[j], headers);
- if (!reg) return -1;
- }
- }
- swap_header_text(hout, hheaders);
- bam_header_destroy(hheaders);
- hheaders = NULL;
- }
} else { // validate multiple baf
- if (hout->n_targets != hin->n_targets) {
- fprintf(stderr, "[bam_merge_core] file '%s' has different number of target sequences. Continue anyway!\n", fn[i]);
- } else {
- for (j = 0; j < hout->n_targets; ++j) {
- if (strcmp(hout->target_name[j], hin->target_name[j])) {
- fprintf(stderr, "[bam_merge_core] different target sequence name: '%s' != '%s' in file '%s'. Continue anyway!\n",
- hout->target_name[j], hin->target_name[j], fn[i]);
- }
+ int min_n_targets = hout->n_targets;
+ if (hin->n_targets < min_n_targets) min_n_targets = hin->n_targets;
+
+ for (j = 0; j < min_n_targets; ++j)
+ if (strcmp(hout->target_name[j], hin->target_name[j]) != 0) {
+ fprintf(stderr, "[bam_merge_core] different target sequence name: '%s' != '%s' in file '%s'\n",
+ hout->target_name[j], hin->target_name[j], fn[i]);
+ return -1;
}
+
+ // If this input file has additional target reference sequences,
+ // add them to the headers to be output
+ if (hin->n_targets > hout->n_targets) {
+ swap_header_targets(hout, hin);
+ // FIXME Possibly we should also create @SQ text headers
+ // for the newly added reference sequences
}
+
bam_header_destroy(hin);
}
}
+ if (hheaders) {
+ // If the text headers to be swapped in include any @SQ headers,
+ // check that they are consistent with the existing binary list
+ // of reference information.
+ if (hheaders->n_targets > 0) {
+ if (hout->n_targets != hheaders->n_targets) {
+ fprintf(stderr, "[bam_merge_core] number of @SQ headers in '%s' differs from number of target sequences\n", headers);
+ if (!reg) return -1;
+ }
+ for (j = 0; j < hout->n_targets; ++j)
+ if (strcmp(hout->target_name[j], hheaders->target_name[j]) != 0) {
+ fprintf(stderr, "[bam_merge_core] @SQ header '%s' in '%s' differs from target sequence\n", hheaders->target_name[j], headers);
+ if (!reg) return -1;
+ }
+ }
+
+ swap_header_text(hout, hheaders);
+ bam_header_destroy(hheaders);
+ }
+
if (reg) {
int tid, beg, end;
if (bam_parse_region(hout, reg, &tid, &beg, &end) < 0) {
#endif
#ifndef PACKAGE_VERSION
-#define PACKAGE_VERSION "0.1.10 (r829)"
+#define PACKAGE_VERSION "0.1.11 (r851)"
#endif
int bam_taf2baf(int argc, char *argv[]);
.B -Q
Output the QCALL likelihood format
.TP
+.B -f
+Reference-free variant calling mode. In this mode, the prior will be
+folded; a variant is called iff the sample(s) contains at least two
+alleles; the QUAL field in the VCF/BCF output is changed accordingly.
+.TP
.BI "-1 " INT
Number of group-1 samples. This option is used for dividing input into
two groups for comparing. A zero value disables this functionality. [0]
#define VC_CALL_GT 2048
#define VC_ADJLD 4096
#define VC_NO_INDEL 8192
+#define VC_FOLDED 16384
typedef struct {
int flag, prior_type, n1;
tid = begin = end = -1;
memset(&vc, 0, sizeof(viewconf_t));
vc.prior_type = vc.n1 = -1; vc.theta = 1e-3; vc.pref = 0.5; vc.indel_frac = -1.;
- while ((c = getopt(argc, argv, "N1:l:cHAGvbSuP:t:p:QgLi:I")) >= 0) {
+ while ((c = getopt(argc, argv, "fN1:l:cHAGvbSuP:t:p:QgLi:I")) >= 0) {
switch (c) {
+ case 'f': vc.flag |= VC_FOLDED; break;
case '1': vc.n1 = atoi(optarg); break;
case 'l': vc.fn_list = strdup(optarg); break;
case 'N': vc.flag |= VC_ACGT_ONLY; break;
fprintf(stderr, " -Q output the QCALL likelihood format\n");
fprintf(stderr, " -L calculate LD for adjacent sites\n");
fprintf(stderr, " -I skip indels\n");
+ fprintf(stderr, " -f reference-free variant calling\n");
fprintf(stderr, " -1 INT number of group-1 samples [0]\n");
fprintf(stderr, " -l FILE list of sites to output [all sites]\n");
fprintf(stderr, " -t FLOAT scaled substitution mutation rate [%.4lg]\n", vc.theta);
bcf_p1_set_n1(p1, vc.n1);
bcf_p1_init_subprior(p1, vc.prior_type, vc.theta);
}
- if (vc.indel_frac > 0.) bcf_p1_indel_prior(p1, vc.indel_frac);
+ if (vc.indel_frac > 0.) bcf_p1_indel_prior(p1, vc.indel_frac); // otherwise use the default indel_frac
+ if (vc.flag & VC_FOLDED) bcf_p1_set_folded(p1);
}
if (vc.fn_list) hash = bcf_load_pos(vc.fn_list, h);
if (optind + 1 < argc && !(vc.flag&VC_VCFIN)) {
};
struct __bcf_p1aux_t {
- int n, M, n1, is_indel;
+ int n, M, n1, is_indel, is_folded;
double *q2p, *pdg; // pdg -> P(D|g)
double *phi, *phi_indel;
double *z, *zswap; // aux for afs
int PL_len;
};
+static void fold_array(int M, double *x)
+{
+ int k;
+ for (k = 0; k < M/2; ++k)
+ x[k] = x[M-k] = (x[k] + x[M-k]) / 2.;
+}
+
void bcf_p1_indel_prior(bcf_p1aux_t *ma, double x)
{
int i;
return 0;
}
+void bcf_p1_set_folded(bcf_p1aux_t *p1a)
+{
+ if (p1a->n1 < 0) {
+ p1a->is_folded = 1;
+ fold_array(p1a->M, p1a->phi);
+ fold_array(p1a->M, p1a->phi_indel);
+ }
+}
+
void bcf_p1_destroy(bcf_p1aux_t *ma)
{
if (ma) {
//
rst->rank0 = cal_pdg(b, ma);
rst->f_exp = mc_cal_afs(ma);
- rst->p_ref = ma->afs1[ma->M];
+ rst->p_ref = ma->is_folded? ma->afs1[ma->M] + ma->afs1[0] : ma->afs1[ma->M];
// calculate f_flat and f_em
for (k = 0, sum = 0.; k <= ma->M; ++k)
sum += (long double)ma->z[k];
void bcf_p1_dump_afs(bcf_p1aux_t *ma)
{
int k;
+ if (ma->is_folded) fold_array(ma->M, ma->afs);
fprintf(stderr, "[afs]");
for (k = 0; k <= ma->M; ++k)
fprintf(stderr, " %d:%.3lf", k, ma->afs[ma->M - k]);
int bcf_p1_read_prior(bcf_p1aux_t *ma, const char *fn);
long double bcf_p1_cal_g3(bcf_p1aux_t *p1a, double g[3]);
int bcf_p1_set_n1(bcf_p1aux_t *b, int n1);
+ void bcf_p1_set_folded(bcf_p1aux_t *p1a); // only effective when set_n1() is not called
#ifdef __cplusplus
}
}
next if ($t[4] eq '.'); # skip non-var sites
# check if the site is a SNP
- my $is_snp = 1;
+ my $type = 1; # SNP
if (length($t[3]) > 1) {
- $is_snp = 0;
+ $type = 2; # MNP
+ my @s = split(',', $t[4]);
+ for (@s) {
+ $type = 3 if (length != length($t[3]));
+ }
} else {
my @s = split(',', $t[4]);
for (@s) {
- $is_snp = 0 if (length > 1);
+ $type = 3 if (length > 1);
}
}
# clear the out-of-range elements
$flt = 7 if ($flt == 0 && /PV4=([^,]+),([^,]+),([^,]+),([^,;\t]+)/
&& ($1<$opts{1} || $2<$opts{2} || $3<$opts{3} || $4<$opts{4}));
- # site dependent filters
- my ($rlen, $indel_score) = (0, -1); # $indel_score<0 for SNPs
+ my $score = $t[5] * 100 + $dp_alt;
+ my $rlen = length($t[3]) - 1; # $indel_score<0 for SNPs
if ($flt == 0) {
- if (!$is_snp) { # an indel
- $rlen = length($t[3]) - 1;
- $indel_score = $t[5] * 100 + $dp_alt;
- # filtering SNPs
+ if ($type == 3) { # an indel
+ # filtering SNPs and MNPs
for my $x (@staging) {
- next if ($x->[0] >= 0 || $x->[1] || $x->[4] + $x->[2] + $ow < $t[1]);
+ next if (($x->[0]&3) == 3 || $x->[1] || $x->[4] + $x->[2] + $ow < $t[1]);
$x->[1] = 5;
}
# check the staging list for indel filtering
for my $x (@staging) {
- next if ($x->[0] < 0 || $x->[1] || $x->[4] + $x->[2] + $ol < $t[1]);
- if ($x->[0] < $indel_score) {
+ next if (($x->[0]&3) != 3 || $x->[1] || $x->[4] + $x->[2] + $ol < $t[1]);
+ if ($x->[0]>>2 < $score) {
$x->[1] = 6;
} else {
$flt = 6; last;
}
}
- } else { # a SNP
+ } else { # SNP or MNP
for my $x (@staging) {
- next if ($x->[0] < 0 || $x->[1] || $x->[4] + $x->[2] + $ow < $t[1]);
+ next if (($x->[0]&3) != 3 || $x->[4] + $x->[2] + $ow < $t[1]);
$flt = 5;
last;
}
+ # check MNP
+ for my $x (@staging) {
+ next if (($x->[0]&3) == 3 || $x->[4] + $x->[2] < $t[1]);
+ if ($x->[0]>>2 < $score) {
+ $x->[1] = 8;
+ } else {
+ $flt = 8; last;
+ }
+ }
}
}
- push(@staging, [$indel_score, $flt, $rlen, @t]);
+ push(@staging, [$score<<2|$type, $flt, $rlen, @t]);
}
# output the last few elements in the staging list
while (@staging) {
if ($first->[1] == 0) {
print join("\t", @$first[3 .. @$first-1]), "\n";
} elsif ($is_print) {
- print STDERR join("\t", substr("UQdDaGgP", $first->[1], 1), @$first[3 .. @$first-1]), "\n";
+ print STDERR join("\t", substr("UQdDaGgPM", $first->[1], 1), @$first[3 .. @$first-1]), "\n";
}
}
if (fp->type == KNF_TYPE_LOCAL) { // on Windows, the following block is necessary; not on UNIX
off_t rest = len, curr;
while (rest) {
- curr = read(fp->fd, buf + l, rest);
+ do {
+ curr = read(fp->fd, buf + l, rest);
+ } while (curr < 0 && EINTR == errno);
+ if (curr < 0) return -1;
if (curr == 0) break;
l += curr; rest -= curr;
}
-.TH samtools 1 "15 November 2010" "samtools-0.1.10" "Bioinformatics tools"
+.TH samtools 1 "21 November 2010" "samtools-0.1.11" "Bioinformatics tools"
.SH NAME
.PP
samtools - Utilities for the Sequence Alignment/Map (SAM) format
-samtools-0.1.10 15 November 2010 samtools(1)
+samtools-0.1.11 21 November 2010 samtools(1)