DFLAGS= -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE -D_USE_KNETFILE -D_CURSES_LIB=1
KNETFILE_O= knetfile.o
LOBJS= bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \
- bam_pileup.o bam_lpileup.o bam_md.o glf.o razf.o faidx.o \
+ bam_pileup.o bam_lpileup.o bam_md.o glf.o razf.o faidx.o bedidx.o \
$(KNETFILE_O) bam_sort.o sam_header.o bam_reheader.o kprobaln.o bam_cat.o
AOBJS= bam_tview.o bam_maqcns.o bam_plcmd.o sam_view.o \
bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \
bamtk.o kaln.o bam2bcf.o bam2bcf_indel.o errmod.o sample.o \
- cut_target.o phase.o
+ cut_target.o phase.o bam2depth.o
PROG= samtools
INCLUDES= -I.
SUBDIRS= . bcftools misc
-Beta release 0.1.14 (16 March, 2011)
+Beta Release 0.1.15 (10 April, 2011)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Noteable changes:
+
+ * Allow to perform variant calling or to extract information in multiple
+ regions specified by a BED file (`samtools mpileup -l', `samtools view -L'
+ and `bcftools view -l').
+
+ * Added the `depth' command to samtools to compute the per-base depth with a
+ simpler interface. File `bam2depth.c', which implements this command, is the
+ recommended example on how to use the mpileup APIs.
+
+ * Estimate genotype frequencies with ML; perform chi^2 based Hardy-Weinberg
+ test using this estimate.
+
+ * For `samtools view', when `-R' is specified, drop read groups in the header
+ that are not contained in the specified file.
+
+ * For `samtools flagstat', separate QC-pass and QC-fail reads.
+
+ * Improved the command line help of `samtools mpileup' and `bcftools view'.
+
+ * Use a global variable to control the verbose level of samtools stderr
+ output. Nonetheless, it has not been full utilized.
+
+ * Fixed an issue in association test which may report false associations,
+ possibly due to floating point underflow.
+
+(0.1.15: 10 April 2011, r949:203)
+
+
+
+Beta release 0.1.14 (21 March, 2011)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
This release implements a method for testing associations for case-control
Another new feature is to make X chromosome variant calls when female and male
samples are both present. The user needs to provide a file indicating the
-ploidy of each sample.
+ploidy of each sample (see also manual bcftools/bcftools.1).
+
+Other notable changes:
+
+ * Added `bcftools view -F' to parse BCF files generated by samtools r921 or
+ older which encodes PL in a different way.
-Other new features:
+ * Changed the behavior of `bcftools view -s'. Now when a list of samples is
+ provided, the samples in the output will be reordered to match the ordering
+ in the sample list. This change is mainly designed for association test.
+
+ * Sped up `bcftools view -v' for target sequencing given thousands of samples.
+ Also added a new option `view -d' to skip loci where only a few samples are
+ covered by reads.
+
+ * Dropped HWE test. This feature has never been implemented properly. An EM
+ should be much better. To be implemented in future.
+
+ * Added the `cat' command to samtools. This command concatenate BAMs with
+ identical sequence dictionaries in an efficient way. Modified from bam_cat.c
+ written by Chris Saunders.
+
+ * Added `samtools view -1' to write BAMs at a low compression level but twice
+ faster to create. The `sort' command generates temporary files at a low
+ compression level as well.
+
+ * Added `samtools mpileup -6' to accept "BAM" with Illumina 1.3+ quality
+ strings (strictly speaking, such a file is not BAM).
* Added `samtools mpileup -L' to skip INDEL calling in regions with
excessively high coverage. Such regions dramatically slow down mpileup.
- * Added `bcftools view -F' to parse BCF files generated by samtools r921 or
- older which encode PL in a different way.
+ * Updated `misc/export2sam.pl', provided by Chris Saunders from Illumina Inc.
-(0.1.14: 16 March 2011, r930:163)
+(0.1.14: 21 March 2011, r933:170)
#include "kstring.h"
#include "sam_header.h"
-int bam_is_be = 0;
+int bam_is_be = 0, bam_verbose = 2;
char *bam_flag2char_table = "pPuUrR12sfd\0\0\0\0\0";
/**************************
BAM library provides I/O and various operations on manipulating files
in the BAM (Binary Alignment/Mapping) or SAM (Sequence Alignment/Map)
- format. It now supports importing from or exporting to TAM, sorting,
+ format. It now supports importing from or exporting to SAM, sorting,
merging, generating pileup, and quickly retrieval of reads overlapped
with a specified region.
@copyright Genome Research Ltd.
*/
-#define BAM_VERSION "0.1.14 (r933:170)"
+#define BAM_VERSION "0.1.15 (r949:203)"
#include <stdint.h>
#include <stdlib.h>
*/
extern int bam_is_be;
+/*!
+ @abstract Verbose level between 0 and 3; 0 is supposed to disable all
+ debugging information, though this may not have been implemented.
+ */
+extern int bam_verbose;
+
/*! @abstract Table for converting a nucleotide character to the 4-bit encoding. */
extern unsigned char bam_nt16_table[256];
--- /dev/null
+/* This program demonstrates how to generate pileup from multiple BAMs
+ * simutaneously, to achieve random access and to use the BED interface.
+ * To compile this program separately, you may:
+ *
+ * gcc -g -O2 -Wall -o bam2depth -D_MAIN_BAM2DEPTH bam2depth.c -L. -lbam -lz
+ */
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <unistd.h>
+#include "bam.h"
+
+typedef struct { // auxiliary data structure
+ bamFile fp; // the file handler
+ bam_iter_t iter; // NULL if a region not specified
+ int min_mapQ; // mapQ filter
+} aux_t;
+
+void *bed_read(const char *fn); // read a BED or position list file
+void bed_destroy(void *_h); // destroy the BED data structure
+int bed_overlap(const void *_h, const char *chr, int beg, int end); // test if chr:beg-end overlaps
+
+// This function reads a BAM alignment from one BAM file.
+static int read_bam(void *data, bam1_t *b) // read level filters better go here to avoid pileup
+{
+ aux_t *aux = (aux_t*)data; // data in fact is a pointer to an auxiliary structure
+ int ret = aux->iter? bam_iter_read(aux->fp, aux->iter, b) : bam_read1(aux->fp, b);
+ if ((int)b->core.qual < aux->min_mapQ) b->core.flag |= BAM_FUNMAP;
+ return ret;
+}
+
+#ifdef _MAIN_BAM2DEPTH
+int main(int argc, char *argv[])
+#else
+int main_depth(int argc, char *argv[])
+#endif
+{
+ int i, n, tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0;
+ const bam_pileup1_t **plp;
+ char *reg = 0; // specified region
+ void *bed = 0; // BED data structure
+ bam_header_t *h = 0; // BAM header of the 1st input
+ aux_t **data;
+ bam_mplp_t mplp;
+
+ // parse the command line
+ while ((n = getopt(argc, argv, "r:b:q:Q:")) >= 0) {
+ switch (n) {
+ case 'r': reg = strdup(optarg); break; // parsing a region requires a BAM header
+ case 'b': bed = bed_read(optarg); break; // BED or position list file can be parsed now
+ case 'q': baseQ = atoi(optarg); break; // base quality threshold
+ case 'Q': mapQ = atoi(optarg); break; // mapping quality threshold
+ }
+ }
+ if (optind == argc) {
+ fprintf(stderr, "Usage: bam2depth [-r reg] [-q baseQthres] [-Q mapQthres] [-b in.bed] <in1.bam> [...]\n");
+ return 1;
+ }
+
+ // initialize the auxiliary data structures
+ n = argc - optind; // the number of BAMs on the command line
+ data = calloc(n, sizeof(void*)); // data[i] for the i-th input
+ beg = 0; end = 1<<30; tid = -1; // set the default region
+ for (i = 0; i < n; ++i) {
+ bam_header_t *htmp;
+ data[i] = calloc(1, sizeof(aux_t));
+ data[i]->fp = bam_open(argv[optind+i], "r"); // open BAM
+ data[i]->min_mapQ = mapQ; // set the mapQ filter
+ htmp = bam_header_read(data[i]->fp); // read the BAM header
+ if (i == 0) {
+ h = htmp; // keep the header of the 1st BAM
+ if (reg) bam_parse_region(h, reg, &tid, &beg, &end); // also parse the region
+ } else bam_header_destroy(htmp); // if not the 1st BAM, trash the header
+ if (tid >= 0) { // if a region is specified and parsed successfully
+ bam_index_t *idx = bam_index_load(argv[optind+i]); // load the index
+ data[i]->iter = bam_iter_query(idx, tid, beg, end); // set the iterator
+ bam_index_destroy(idx); // the index is not needed any more; phase out of the memory
+ }
+ }
+
+ // the core multi-pileup loop
+ mplp = bam_mplp_init(n, read_bam, (void**)data); // initialization
+ n_plp = calloc(n, sizeof(int)); // n_plp[i] is the number of covering reads from the i-th BAM
+ plp = calloc(n, sizeof(void*)); // plp[i] points to the array of covering reads (internal in mplp)
+ while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0) { // come to the next covered position
+ if (pos < beg || pos >= end) continue; // out of range; skip
+ if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue; // not in BED; skip
+ fputs(h->target_name[tid], stdout); printf("\t%d", pos+1); // a customized printf() would be faster
+ for (i = 0; i < n; ++i) { // base level filters have to go here
+ int j, m = 0;
+ for (j = 0; j < n_plp[i]; ++j) {
+ const bam_pileup1_t *p = plp[i] + j; // DON'T modfity plp[][] unless you really know
+ if (p->is_del || p->is_refskip) ++m; // having dels or refskips at tid:pos
+ else if (bam1_qual(p->b)[p->qpos] < baseQ) ++m; // low base quality
+ }
+ printf("\t%d", n_plp[i] - m); // this the depth to output
+ }
+ putchar('\n');
+ }
+ free(n_plp); free(plp);
+ bam_mplp_destroy(mplp);
+
+ bam_header_destroy(h);
+ for (i = 0; i < n; ++i) {
+ bam_close(data[i]->fp);
+ if (data[i]->iter) bam_iter_destroy(data[i]->iter);
+ free(data[i]);
+ }
+ free(data); free(reg);
+ if (bed) bed_destroy(bed);
+ return 0;
+}
fprintf(stderr, " -G FLOAT prior of an indel between two haplotypes (for -c) [%.4g]\n", d->ido->r_indel);
fprintf(stderr, " -I INT phred prob. of an indel in sequencing/prep. (for -c) [%d]\n", d->ido->q_indel);
fprintf(stderr, "\n");
+ fprintf(stderr, "Warning: Please use the `mpileup' command instead `pileup'. `Pileup' is deprecated!\n\n");
free(fn_list); free(fn_fa); free(d);
return 1;
}
#define MPLP_EXT_BAQ 0x800
#define MPLP_ILLUMINA13 0x1000
+void *bed_read(const char *fn);
+void bed_destroy(void *_h);
+int bed_overlap(const void *_h, const char *chr, int beg, int end);
+
typedef struct {
int max_mq, min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth;
int openQ, extQ, tandemQ, min_support; // for indels
double min_frac; // for indels
- char *reg, *fn_pos, *pl_list;
+ char *reg, *pl_list;
faidx_t *fai;
- kh_64_t *hash;
- void *rghash;
+ void *bed, *rghash;
} mplp_conf_t;
typedef struct {
bamFile fp;
bam_iter_t iter;
+ bam_header_t *h;
int ref_id;
char *ref;
const mplp_conf_t *conf;
int has_ref;
ret = ma->iter? bam_iter_read(ma->fp, ma->iter, b) : bam_read1(ma->fp, b);
if (ret < 0) break;
+ if (b->core.tid < 0 || (b->core.flag&BAM_FUNMAP)) { // exclude unmapped reads
+ skip = 1;
+ continue;
+ }
+ if (ma->conf->bed) { // test overlap
+ skip = !bed_overlap(ma->conf->bed, ma->h->target_name[b->core.tid], b->core.pos, bam_calend(&b->core, bam1_cigar(b)));
+ if (skip) continue;
+ }
if (ma->conf->rghash) { // exclude read groups
uint8_t *rg = bam_aux_get(b, "RG");
skip = (rg && bcf_str2id(ma->conf->rghash, (const char*)(rg+1)) >= 0);
int q = bam_cap_mapQ(b, ma->ref, ma->conf->capQ_thres);
if (q < 0) skip = 1;
else if (b->core.qual > q) b->core.qual = q;
- } else if (b->core.flag&BAM_FUNMAP) skip = 1;
+ }
else if (b->core.qual < ma->conf->min_mq) skip = 1;
else if ((ma->conf->flag&MPLP_NO_ORPHAN) && (b->core.flag&1) && !(b->core.flag&2)) skip = 1;
} while (skip);
q = bam_aux_get(p->b, "RG");
if (q) id = bam_smpl_rg2smid(sm, fn[i], (char*)q+1, buf);
if (id < 0) id = bam_smpl_rg2smid(sm, fn[i], 0, buf);
- assert(id >= 0 && id < m->n);
+ if (id < 0 || id >= m->n) {
+ assert(q); // otherwise a bug
+ fprintf(stderr, "[%s] Read group %s used in file %s but absent from the header or an alignment missing read group.\n", __func__, (char*)q+1, fn[i]);
+ exit(1);
+ }
if (m->n_plp[id] == m->m_plp[id]) {
m->m_plp[id] = m->m_plp[id]? m->m_plp[id]<<1 : 8;
m->plp[id] = realloc(m->plp[id], sizeof(bam_pileup1_t) * m->m_plp[id]);
bam_mplp_t iter;
bam_header_t *h = 0;
char *ref;
- khash_t(64) *hash = 0;
void *rghash = 0;
bcf_callaux_t *bca = 0;
data[i]->fp = strcmp(fn[i], "-") == 0? bam_dopen(fileno(stdin), "r") : bam_open(fn[i], "r");
data[i]->conf = conf;
h_tmp = bam_header_read(data[i]->fp);
+ data[i]->h = i? h : h_tmp; // for i==0, "h" has not been set yet
bam_smpl_add(sm, fn[i], h_tmp->text);
rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list);
if (conf->reg) {
gplp.plp = calloc(sm->n, sizeof(void*));
fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, sm->n, n);
- if (conf->fn_pos) hash = load_pos(conf->fn_pos, h);
// write the VCF header
if (conf->flag & MPLP_GLF) {
kstring_t s;
fprintf(stderr, "(%s) Max depth is above 1M. Potential memory hog!\n", __func__);
if (max_depth * sm->n < 8000) {
max_depth = 8000 / sm->n;
- fprintf(stderr, "<%s> Set max per-sample depth to %d\n", __func__, max_depth);
+ fprintf(stderr, "<%s> Set max per-file depth to %d\n", __func__, max_depth);
}
max_indel_depth = conf->max_indel_depth * sm->n;
bam_mplp_set_maxcnt(iter, max_depth);
while (bam_mplp_auto(iter, &tid, &pos, n_plp, plp) > 0) {
if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested
- if (hash) {
- khint_t k;
- k = kh_get(64, hash, (uint64_t)tid<<32 | pos);
- if (k == kh_end(hash)) continue;
- }
+ if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue;
if (tid != ref_tid) {
free(ref); ref = 0;
if (conf->fai) ref = fai_fetch(conf->fai, h->target_name[tid], &ref_len);
for (i = 0; i < gplp.n; ++i) free(gplp.plp[i]);
free(gplp.plp); free(gplp.n_plp); free(gplp.m_plp);
bcf_call_del_rghash(rghash);
- if (hash) { // free the hash table
- khint_t k;
- for (k = kh_begin(hash); k < kh_end(hash); ++k)
- if (kh_exist(hash, k)) free(kh_val(hash, k));
- kh_destroy(64, hash);
- }
bcf_hdr_destroy(bh); bcf_call_destroy(bca); free(bc.PL); free(bcr);
bam_mplp_destroy(iter);
bam_header_destroy(h);
break;
case 'd': mplp.max_depth = atoi(optarg); break;
case 'r': mplp.reg = strdup(optarg); break;
- case 'l': mplp.fn_pos = strdup(optarg); break;
+ case 'l': mplp.bed = bed_read(optarg); break;
case 'P': mplp.pl_list = strdup(optarg); break;
case 'g': mplp.flag |= MPLP_GLF; break;
case 'u': mplp.flag |= MPLP_NO_COMP | MPLP_GLF; break;
if (use_orphan) mplp.flag &= ~MPLP_NO_ORPHAN;
if (argc == 1) {
fprintf(stderr, "\n");
- fprintf(stderr, "Usage: samtools mpileup [options] in1.bam [in2.bam [...]]\n\n");
- fprintf(stderr, "Options: -f FILE reference sequence file [null]\n");
- fprintf(stderr, " -r STR region in which pileup is generated [null]\n");
- fprintf(stderr, " -l FILE list of positions (format: chr pos) [null]\n");
- fprintf(stderr, " -b FILE list of input BAM files [null]\n");
- fprintf(stderr, " -M INT cap mapping quality at INT [%d]\n", mplp.max_mq);
- fprintf(stderr, " -Q INT min base quality [%d]\n", mplp.min_baseQ);
- fprintf(stderr, " -q INT filter out alignment with MQ smaller than INT [%d]\n", mplp.min_mq);
- fprintf(stderr, " -d INT max per-BAM depth [%d]\n", mplp.max_depth);
- fprintf(stderr, " -L INT max per-sample depth for INDEL calling [%d]\n", mplp.max_indel_depth);
- fprintf(stderr, " -P STR comma separated list of platforms for indels [all]\n");
- fprintf(stderr, " -o INT Phred-scaled gap open sequencing error probability [%d]\n", mplp.openQ);
- fprintf(stderr, " -e INT Phred-scaled gap extension seq error probability [%d]\n", mplp.extQ);
- fprintf(stderr, " -h INT coefficient for homopolyer errors [%d]\n", mplp.tandemQ);
- fprintf(stderr, " -m INT minimum gapped reads for indel candidates [%d]\n", mplp.min_support);
- fprintf(stderr, " -F FLOAT minimum fraction of gapped reads for candidates [%g]\n", mplp.min_frac);
- fprintf(stderr, " -G FILE exclude read groups listed in FILE [null]\n");
- fprintf(stderr, " -6 quality is in the Illumina-1.3+ encoding\n");
- fprintf(stderr, " -A use anomalous read pairs in SNP/INDEL calling\n");
- fprintf(stderr, " -g generate BCF output\n");
- fprintf(stderr, " -u do not compress BCF output\n");
- fprintf(stderr, " -E extended BAQ for higher sensitivity but lower specificity\n");
- fprintf(stderr, " -B disable BAQ computation\n");
- fprintf(stderr, " -D output per-sample DP\n");
- fprintf(stderr, " -S output per-sample SP (strand bias P-value, slow)\n");
- fprintf(stderr, " -I do not perform indel calling\n");
+ fprintf(stderr, "Usage: samtools mpileup [options] in1.bam [in2.bam [...]]\n\n");
+ fprintf(stderr, "Input options:\n\n");
+ fprintf(stderr, " -6 assume the quality is in the Illumina-1.3+ encoding\n");
+ fprintf(stderr, " -A count anomalous read pairs\n");
+ fprintf(stderr, " -B disable BAQ computation\n");
+ fprintf(stderr, " -b FILE list of input BAM files [null]\n");
+ fprintf(stderr, " -d INT max per-BAM depth to avoid excessive memory usage [%d]\n", mplp.max_depth);
+ fprintf(stderr, " -E extended BAQ for higher sensitivity but lower specificity\n");
+ fprintf(stderr, " -f FILE faidx indexed reference sequence file [null]\n");
+ fprintf(stderr, " -G FILE exclude read groups listed in FILE [null]\n");
+ fprintf(stderr, " -l FILE list of positions (chr pos) or regions (BED) [null]\n");
+ fprintf(stderr, " -M INT cap mapping quality at INT [%d]\n", mplp.max_mq);
+ fprintf(stderr, " -r STR region in which pileup is generated [null]\n");
+ fprintf(stderr, " -q INT skip alignments with mapQ smaller than INT [%d]\n", mplp.min_mq);
+ fprintf(stderr, " -Q INT skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp.min_baseQ);
+ fprintf(stderr, "\nOutput options:\n\n");
+ fprintf(stderr, " -D output per-sample DP in BCF (require -g/-u)\n");
+ fprintf(stderr, " -g generate BCF output (genotype likelihoods)\n");
+ fprintf(stderr, " -S output per-sample strand bias P-value in BCF (require -g/-u)\n");
+ fprintf(stderr, " -u generate uncompress BCF output\n");
+ fprintf(stderr, "\nSNP/INDEL genotype likelihoods options (effective with `-g' or `-u'):\n\n");
+ fprintf(stderr, " -e INT Phred-scaled gap extension seq error probability [%d]\n", mplp.extQ);
+ fprintf(stderr, " -F FLOAT minimum fraction of gapped reads for candidates [%g]\n", mplp.min_frac);
+ fprintf(stderr, " -h INT coefficient for homopolymer errors [%d]\n", mplp.tandemQ);
+ fprintf(stderr, " -I do not perform indel calling\n");
+ fprintf(stderr, " -L INT max per-sample depth for INDEL calling [%d]\n", mplp.max_indel_depth);
+ fprintf(stderr, " -m INT minimum gapped reads for indel candidates [%d]\n", mplp.min_support);
+ fprintf(stderr, " -o INT Phred-scaled gap open sequencing error probability [%d]\n", mplp.openQ);
+ fprintf(stderr, " -P STR comma separated list of platforms for indels [all]\n");
fprintf(stderr, "\n");
fprintf(stderr, "Notes: Assuming diploid individuals.\n\n");
return 1;
if (mplp.rghash) bcf_str2id_thorough_destroy(mplp.rghash);
free(mplp.reg); free(mplp.pl_list);
if (mplp.fai) fai_destroy(mplp.fai);
+ if (mplp.bed) bed_destroy(mplp.bed);
return 0;
}
#define MERGE_RG 1
#define MERGE_UNCOMP 2
+#define MERGE_LEVEL1 4
/*!
@abstract Merge multiple sorted BAM.
}
else h->pos = HEAP_EMPTY;
}
- if (flag & MERGE_UNCOMP) {
- fpout = strcmp(out, "-")? bam_open(out, "wu") : bam_dopen(fileno(stdout), "wu");
- } else {
- fpout = strcmp(out, "-")? bam_open(out, "w") : bam_dopen(fileno(stdout), "w");
- }
+ if (flag & MERGE_UNCOMP) fpout = strcmp(out, "-")? bam_open(out, "wu") : bam_dopen(fileno(stdout), "wu");
+ else if (flag & MERGE_LEVEL1) fpout = strcmp(out, "-")? bam_open(out, "w1") : bam_dopen(fileno(stdout), "w1");
+ else fpout = strcmp(out, "-")? bam_open(out, "w") : bam_dopen(fileno(stdout), "w");
if (fpout == 0) {
fprintf(stderr, "[%s] fail to create the output file.\n", __func__);
return -1;
int c, is_by_qname = 0, flag = 0, ret = 0;
char *fn_headers = NULL, *reg = 0;
- while ((c = getopt(argc, argv, "h:nruR:")) >= 0) {
+ while ((c = getopt(argc, argv, "h:nru1R:")) >= 0) {
switch (c) {
case 'r': flag |= MERGE_RG; break;
case 'h': fn_headers = strdup(optarg); break;
case 'n': is_by_qname = 1; break;
+ case '1': flag |= MERGE_LEVEL1; break;
case 'u': flag |= MERGE_UNCOMP; break;
case 'R': reg = strdup(optarg); break;
}
fprintf(stderr, "Options: -n sort by read names\n");
fprintf(stderr, " -r attach RG tag (inferred from file names)\n");
fprintf(stderr, " -u uncompressed BAM output\n");
+ fprintf(stderr, " -1 compress level 1\n");
fprintf(stderr, " -R STR merge file in the specified region STR [all]\n");
fprintf(stderr, " -h FILE copy the header in FILE to <out.bam> [in1.bam]\n\n");
fprintf(stderr, "Note: Samtools' merge does not reconstruct the @RG dictionary in the header. Users\n");
#include "bam.h"
typedef struct {
- long long n_reads, n_mapped, n_pair_all, n_pair_map, n_pair_good;
- long long n_sgltn, n_read1, n_read2;
- long long n_qcfail, n_dup;
- long long n_diffchr, n_diffhigh;
+ long long n_reads[2], n_mapped[2], n_pair_all[2], n_pair_map[2], n_pair_good[2];
+ long long n_sgltn[2], n_read1[2], n_read2[2];
+ long long n_dup[2];
+ long long n_diffchr[2], n_diffhigh[2];
} bam_flagstat_t;
#define flagstat_loop(s, c) do { \
- ++(s)->n_reads; \
+ int w = ((c)->flag & BAM_FQCFAIL)? 1 : 0; \
+ ++(s)->n_reads[w]; \
if ((c)->flag & BAM_FPAIRED) { \
- ++(s)->n_pair_all; \
- if ((c)->flag & BAM_FPROPER_PAIR) ++(s)->n_pair_good; \
- if ((c)->flag & BAM_FREAD1) ++(s)->n_read1; \
- if ((c)->flag & BAM_FREAD2) ++(s)->n_read2; \
- if (((c)->flag & BAM_FMUNMAP) && !((c)->flag & BAM_FUNMAP)) ++(s)->n_sgltn; \
+ ++(s)->n_pair_all[w]; \
+ if ((c)->flag & BAM_FPROPER_PAIR) ++(s)->n_pair_good[w]; \
+ if ((c)->flag & BAM_FREAD1) ++(s)->n_read1[w]; \
+ if ((c)->flag & BAM_FREAD2) ++(s)->n_read2[w]; \
+ if (((c)->flag & BAM_FMUNMAP) && !((c)->flag & BAM_FUNMAP)) ++(s)->n_sgltn[w]; \
if (!((c)->flag & BAM_FUNMAP) && !((c)->flag & BAM_FMUNMAP)) { \
- ++(s)->n_pair_map; \
+ ++(s)->n_pair_map[w]; \
if ((c)->mtid != (c)->tid) { \
- ++(s)->n_diffchr; \
- if ((c)->qual >= 5) ++(s)->n_diffhigh; \
+ ++(s)->n_diffchr[w]; \
+ if ((c)->qual >= 5) ++(s)->n_diffhigh[w]; \
} \
} \
} \
- if (!((c)->flag & BAM_FUNMAP)) ++(s)->n_mapped; \
- if ((c)->flag & BAM_FQCFAIL) ++(s)->n_qcfail; \
- if ((c)->flag & BAM_FDUP) ++(s)->n_dup; \
+ if (!((c)->flag & BAM_FUNMAP)) ++(s)->n_mapped[w]; \
+ if ((c)->flag & BAM_FDUP) ++(s)->n_dup[w]; \
} while (0)
bam_flagstat_t *bam_flagstat_core(bamFile fp)
assert(fp);
header = bam_header_read(fp);
s = bam_flagstat_core(fp);
- printf("%lld in total\n", s->n_reads);
- printf("%lld QC failure\n", s->n_qcfail);
- printf("%lld duplicates\n", s->n_dup);
- printf("%lld mapped (%.2f%%)\n", s->n_mapped, (float)s->n_mapped / s->n_reads * 100.0);
- printf("%lld paired in sequencing\n", s->n_pair_all);
- printf("%lld read1\n", s->n_read1);
- printf("%lld read2\n", s->n_read2);
- printf("%lld properly paired (%.2f%%)\n", s->n_pair_good, (float)s->n_pair_good / s->n_pair_all * 100.0);
- printf("%lld with itself and mate mapped\n", s->n_pair_map);
- printf("%lld singletons (%.2f%%)\n", s->n_sgltn, (float)s->n_sgltn / s->n_pair_all * 100.0);
- printf("%lld with mate mapped to a different chr\n", s->n_diffchr);
- printf("%lld with mate mapped to a different chr (mapQ>=5)\n", s->n_diffhigh);
+ printf("%lld + %lld in total (QC-passed reads + QC-failed reads)\n", s->n_reads[0], s->n_reads[1]);
+ printf("%lld + %lld duplicates\n", s->n_dup[0], s->n_dup[1]);
+ printf("%lld + %lld mapped (%.2f%%:%.2f%%)\n", s->n_mapped[0], s->n_mapped[1], (float)s->n_mapped[0] / s->n_reads[0] * 100.0, (float)s->n_mapped[1] / s->n_reads[1] * 100.0);
+ printf("%lld + %lld paired in sequencing\n", s->n_pair_all[0], s->n_pair_all[1]);
+ printf("%lld + %lld read1\n", s->n_read1[0], s->n_read1[1]);
+ printf("%lld + %lld read2\n", s->n_read2[0], s->n_read2[1]);
+ printf("%lld + %lld properly paired (%.2f%%:%.2f%%)\n", s->n_pair_good[0], s->n_pair_good[1], (float)s->n_pair_good[0] / s->n_pair_all[0] * 100.0, (float)s->n_pair_good[1] / s->n_pair_all[1] * 100.0);
+ printf("%lld + %lld with itself and mate mapped\n", s->n_pair_map[0], s->n_pair_map[1]);
+ printf("%lld + %lld singletons (%.2f%%:%.2f%%)\n", s->n_sgltn[0], s->n_sgltn[1], (float)s->n_sgltn[0] / s->n_pair_all[0] * 100.0, (float)s->n_sgltn[1] / s->n_pair_all[1] * 100.0);
+ printf("%lld + %lld with mate mapped to a different chr\n", s->n_diffchr[0], s->n_diffchr[1]);
+ printf("%lld + %lld with mate mapped to a different chr (mapQ>=5)\n", s->n_diffhigh[0], s->n_diffhigh[1]);
free(s);
bam_header_destroy(header);
bam_close(fp);
int main_cut_target(int argc, char *argv[]);
int main_phase(int argc, char *argv[]);
int main_cat(int argc, char *argv[]);
+int main_depth(int argc, char *argv[]);
int faidx_main(int argc, char *argv[]);
int glf3_view_main(int argc, char *argv[]);
fprintf(stderr, " sort sort alignment file\n");
fprintf(stderr, " pileup generate pileup output\n");
fprintf(stderr, " mpileup multi-way pileup\n");
+ fprintf(stderr, " depth compute the depth\n");
fprintf(stderr, " faidx index/extract FASTA\n");
#if _CURSES_LIB != 0
fprintf(stderr, " tview text alignment viewer\n");
else if (strcmp(argv[1], "cat") == 0) return main_cat(argc-1, argv+1);
else if (strcmp(argv[1], "targetcut") == 0) return main_cut_target(argc-1, argv+1);
else if (strcmp(argv[1], "phase") == 0) return main_phase(argc-1, argv+1);
+ else if (strcmp(argv[1], "depth") == 0) return main_depth(argc-1, argv+1);
#if _CURSES_LIB != 0
else if (strcmp(argv[1], "tview") == 0) return bam_tview_main(argc-1, argv+1);
#endif
DFLAGS= -D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE
LOBJS= bcf.o vcf.o bcfutils.o prob1.o ld.o kfunc.o index.o fet.o bcf2qcall.o
OMISC= ..
-AOBJS= call1.o main.o $(OMISC)/kstring.o $(OMISC)/bgzf.o $(OMISC)/knetfile.o
+AOBJS= call1.o main.o $(OMISC)/kstring.o $(OMISC)/bgzf.o $(OMISC)/knetfile.o $(OMISC)/bedidx.o
PROG= bcftools
INCLUDES=
SUBDIRS= .
#include "kstring.h"
#include "time.h"
-#include "khash.h"
-KHASH_SET_INIT_INT64(set64)
-
#include "kseq.h"
KSTREAM_INIT(gzFile, gzread, 16384)
typedef struct {
int flag, prior_type, n1, n_sub, *sublist, n_perm;
- char *fn_list, *prior_file, **subsam, *fn_dict;
+ char *prior_file, **subsam, *fn_dict;
uint8_t *ploidy;
double theta, pref, indel_frac, min_perm_p, min_smpl_frac;
+ void *bed;
} viewconf_t;
-khash_t(set64) *bcf_load_pos(const char *fn, bcf_hdr_t *_h)
-{
- void *str2id;
- gzFile fp;
- kstream_t *ks;
- int ret, dret, lineno = 1;
- kstring_t *str;
- khash_t(set64) *hash = 0;
+void *bed_read(const char *fn);
+void bed_destroy(void *_h);
+int bed_overlap(const void *_h, const char *chr, int beg, int end);
- hash = kh_init(set64);
- str2id = bcf_build_refhash(_h);
- str = calloc(1, sizeof(kstring_t));
- fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
- ks = ks_init(fp);
- while (ks_getuntil(ks, 0, str, &dret) >= 0) {
- int tid = bcf_str2id(str2id, str->s);
- if (tid >= 0 && dret != '\n') {
- if (ks_getuntil(ks, 0, str, &dret) >= 0) {
- uint64_t x = (uint64_t)tid<<32 | (atoi(str->s) - 1);
- kh_put(set64, hash, x, &ret);
- } else break;
- } else fprintf(stderr, "[%s] %s is not a reference name (line %d).\n", __func__, str->s, lineno);
- if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n');
- if (dret < 0) break;
- ++lineno;
- }
- bcf_str2id_destroy(str2id);
- ks_destroy(ks);
- gzclose(fp);
- free(str->s); free(str);
- return hash;
+static double test_hwe(const double g[3])
+{
+ extern double kf_gammaq(double p, double x);
+ double fexp, chi2, f[3], n;
+ int i;
+ n = g[0] + g[1] + g[2];
+ fexp = (2. * g[2] + g[1]) / (2. * n);
+ if (fexp > 1. - 1e-10) fexp = 1. - 1e-10;
+ if (fexp < 1e-10) fexp = 1e-10;
+ f[0] = n * (1. - fexp) * (1. - fexp);
+ f[1] = n * 2. * fexp * (1. - fexp);
+ f[2] = n * fexp * fexp;
+ for (i = 0, chi2 = 0.; i < 3; ++i)
+ chi2 += (g[i] - f[i]) * (g[i] - f[i]) / f[i];
+ return kf_gammaq(.5, chi2 / 2.);
}
typedef struct {
kputs(b->info, &s);
if (b->info[0]) kputc(';', &s);
// ksprintf(&s, "AF1=%.4lg;AFE=%.4lg;CI95=%.4lg,%.4lg", 1.-pr->f_em, 1.-pr->f_exp, pr->cil, pr->cih);
- ksprintf(&s, "AF1=%.4g;CI95=%.4g,%.4g", 1.-pr->f_em, pr->cil, pr->cih);
+ ksprintf(&s, "AF1=%.4g;CI95=%.4g,%.4g;G3=%.4g,%.4g,%.4g", 1.-pr->f_em, pr->cil, pr->cih, pr->g[2], pr->g[1], pr->g[0]);
+ if (n_smpl > 5) {
+ double hwe = test_hwe(pr->g);
+ if (hwe < 0.1) ksprintf(&s, ";HWE=%.4g", hwe);
+ }
if (has_I16) ksprintf(&s, ";DP4=%d,%d,%d,%d;MQ=%d", a.d[0], a.d[1], a.d[2], a.d[3], a.mq);
fq = pr->p_ref_folded < 0.5? -4.343 * log(pr->p_ref_folded) : 4.343 * log(pr->p_var_folded);
if (fq < -999) fq = -999;
if (fq > 999) fq = 999;
ksprintf(&s, ";FQ=%.3g", fq);
- if (pr->cmp[0] >= 0.) {
+ if (pr->cmp[0] >= 0.) { // two sample groups
int i, q[3], pq;
for (i = 1; i < 3; ++i) {
double x = pr->cmp[i] + pr->cmp[0]/2.;
pq = (int)(-4.343 * log(pr->p_chi2) + .499);
if (pr->perm_rank >= 0) ksprintf(&s, ";PR=%d", pr->perm_rank);
ksprintf(&s, ";QCHI2=%d;PCHI2=%.3g;PC2=%d,%d", pq, q[1], q[2], pr->p_chi2);
+ ksprintf(&s, ";AF2=%.4g,%.4g", 1.-pr->f_em2[0], 1.-pr->f_em2[1]);
// ksprintf(&s, ",%g,%g,%g", pr->cmp[0], pr->cmp[1], pr->cmp[2]);
}
if (has_I16 && a.is_tested) ksprintf(&s, ";PV4=%.2g,%.2g,%.2g,%.2g", a.p[0], a.p[1], a.p[2], a.p[3]);
kputs("##INFO=<ID=FQ,Number=1,Type=Float,Description=\"Phred probability of all samples being the same\">\n", &str);
if (!strstr(str.s, "##INFO=<ID=AF1,"))
kputs("##INFO=<ID=AF1,Number=1,Type=Float,Description=\"Max-likelihood estimate of the site allele frequency of the first ALT allele\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=G3,"))
+ kputs("##INFO=<ID=G3,Number=3,Type=Float,Description=\"ML estimate of genotype frequencies\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=HWE,"))
+ kputs("##INFO=<ID=HWE,Number=1,Type=Float,Description=\"Chi^2 based HWE test P-value based on G3\">\n", &str);
if (!strstr(str.s, "##INFO=<ID=CI95,"))
kputs("##INFO=<ID=CI95,Number=2,Type=Float,Description=\"Equal-tail Bayesian credible interval of the site allele frequency at the 95% level\">\n", &str);
if (!strstr(str.s, "##INFO=<ID=PV4,"))
if (!strstr(str.s, "##INFO=<ID=QCHI2,"))
kputs("##INFO=<ID=QCHI2,Number=1,Type=Integer,Description=\"Phred scaled PCHI2.\">\n", &str);
if (!strstr(str.s, "##INFO=<ID=RP,"))
- kputs("##INFO=<ID=RP,Number=1,Type=Integer,Description=\"# permutations yielding a smaller PCHI2.\">\n", &str);
+ kputs("##INFO=<ID=PR,Number=1,Type=Integer,Description=\"# permutations yielding a smaller PCHI2.\">\n", &str);
if (!strstr(str.s, "##FORMAT=<ID=GT,"))
kputs("##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n", &str);
if (!strstr(str.s, "##FORMAT=<ID=GQ,"))
bcf_hdr_t *hin, *hout;
int tid, begin, end;
char moder[4], modew[4];
- khash_t(set64) *hash = 0;
tid = begin = end = -1;
memset(&vc, 0, sizeof(viewconf_t));
while ((c = getopt(argc, argv, "FN1:l:cHAGvbSuP:t:p:QgLi:IMs:D:U:X:d:")) >= 0) {
switch (c) {
case '1': vc.n1 = atoi(optarg); break;
- case 'l': vc.fn_list = strdup(optarg); break;
+ case 'l': vc.bed = bed_read(optarg); break;
case 'D': vc.fn_dict = strdup(optarg); break;
case 'F': vc.flag |= VC_FIX_PL; break;
case 'N': vc.flag |= VC_ACGT_ONLY; break;
}
if (argc == optind) {
fprintf(stderr, "\n");
- fprintf(stderr, "Usage: bcftools view [options] <in.bcf> [reg]\n\n");
- fprintf(stderr, "Options: -c SNP calling\n");
- fprintf(stderr, " -v output potential variant sites only (force -c)\n");
- fprintf(stderr, " -g call genotypes at variant sites (force -c)\n");
- fprintf(stderr, " -b output BCF instead of VCF\n");
- fprintf(stderr, " -u uncompressed BCF output (force -b)\n");
- fprintf(stderr, " -S input is VCF\n");
- fprintf(stderr, " -A keep all possible alternate alleles at variant sites\n");
- fprintf(stderr, " -G suppress all individual genotype information\n");
- fprintf(stderr, " -N skip sites where REF is not A/C/G/T\n");
- fprintf(stderr, " -Q output the QCALL likelihood format\n");
- fprintf(stderr, " -L calculate LD for adjacent sites\n");
- fprintf(stderr, " -I skip indels\n");
- fprintf(stderr, " -F PL generated by r921 or before\n");
- fprintf(stderr, " -d FLOAT skip loci where less than FLOAT fraction of samples covered [0]\n");
- fprintf(stderr, " -D FILE sequence dictionary for VCF->BCF conversion [null]\n");
- fprintf(stderr, " -l FILE list of sites to output [all sites]\n");
- fprintf(stderr, " -t FLOAT scaled substitution mutation rate [%.4g]\n", vc.theta);
- fprintf(stderr, " -i FLOAT indel-to-substitution ratio [%.4g]\n", vc.indel_frac);
- fprintf(stderr, " -p FLOAT variant if P(ref|D)<FLOAT [%.3g]\n", vc.pref);
- fprintf(stderr, " -P STR type of prior: full, cond2, flat [full]\n");
- fprintf(stderr, " -s FILE list of samples to use [all samples]\n");
- fprintf(stderr, " -1 INT number of group-1 samples [0]\n");
- fprintf(stderr, " -U INT number of permutations for association testing (effective with -1) [0]\n");
- fprintf(stderr, " -X FLOAT only perform permutations for P(chi^2)<FLOAT [%g]\n", vc.min_perm_p);
+ fprintf(stderr, "Usage: bcftools view [options] <in.bcf> [reg]\n\n");
+ fprintf(stderr, "Input/output options:\n\n");
+ fprintf(stderr, " -A keep all possible alternate alleles at variant sites\n");
+ fprintf(stderr, " -b output BCF instead of VCF\n");
+ fprintf(stderr, " -D FILE sequence dictionary for VCF->BCF conversion [null]\n");
+ fprintf(stderr, " -F PL generated by r921 or before (which generate old ordering)\n");
+ fprintf(stderr, " -G suppress all individual genotype information\n");
+ fprintf(stderr, " -l FILE list of sites (chr pos) or regions (BED) to output [all sites]\n");
+ fprintf(stderr, " -L calculate LD for adjacent sites\n");
+ fprintf(stderr, " -N skip sites where REF is not A/C/G/T\n");
+ fprintf(stderr, " -Q output the QCALL likelihood format\n");
+ fprintf(stderr, " -s FILE list of samples to use [all samples]\n");
+ fprintf(stderr, " -S input is VCF\n");
+ fprintf(stderr, " -u uncompressed BCF output (force -b)\n");
+ fprintf(stderr, "\nConsensus/variant calling options:\n\n");
+ fprintf(stderr, " -c SNP calling\n");
+ fprintf(stderr, " -d FLOAT skip loci where less than FLOAT fraction of samples covered [0]\n");
+ fprintf(stderr, " -g call genotypes at variant sites (force -c)\n");
+ fprintf(stderr, " -i FLOAT indel-to-substitution ratio [%.4g]\n", vc.indel_frac);
+ fprintf(stderr, " -I skip indels\n");
+ fprintf(stderr, " -p FLOAT variant if P(ref|D)<FLOAT [%.3g]\n", vc.pref);
+ fprintf(stderr, " -P STR type of prior: full, cond2, flat [full]\n");
+ fprintf(stderr, " -t FLOAT scaled substitution mutation rate [%.4g]\n", vc.theta);
+ fprintf(stderr, " -v output potential variant sites only (force -c)\n");
+ fprintf(stderr, "\nContrast calling and association test options:\n\n");
+ fprintf(stderr, " -1 INT number of group-1 samples [0]\n");
+ fprintf(stderr, " -U INT number of permutations for association testing (effective with -1) [0]\n");
+ fprintf(stderr, " -X FLOAT only perform permutations for P(chi^2)<FLOAT [%g]\n", vc.min_perm_p);
fprintf(stderr, "\n");
return 1;
}
}
if (vc.indel_frac > 0.) bcf_p1_indel_prior(p1, vc.indel_frac); // otherwise use the default indel_frac
}
- if (vc.fn_list) hash = bcf_load_pos(vc.fn_list, hin);
if (optind + 1 < argc && !(vc.flag&VC_VCFIN)) {
void *str2id = bcf_build_refhash(hout);
if (bcf_parse_region(str2id, argv[optind+1], &tid, &begin, &end) >= 0) {
x = toupper(b->ref[0]);
if (x != 'A' && x != 'C' && x != 'G' && x != 'T') continue;
}
- if (hash) {
- uint64_t x = (uint64_t)b->tid<<32 | b->pos;
- khint_t k = kh_get(set64, hash, x);
- if (kh_size(hash) == 0) break;
- if (k == kh_end(hash)) continue;
- kh_del(set64, hash, k);
- }
+ if (vc.bed && !bed_overlap(vc.bed, hin->ns[b->tid], b->pos, b->pos + strlen(b->ref))) continue;
if (tid >= 0) {
int l = strlen(b->ref);
l = b->pos + (l > 0? l : 1);
kstring_t s;
s.m = s.l = 0; s.s = 0;
if (*b->info) kputc(';', &s);
- ksprintf(&s, "NEIR=%.3f;NEIF=%.3f,%.3f", r2, f[0]+f[2], f[0]+f[1]);
+ ksprintf(&s, "NEIR=%.3f;NEIF4=%.3f,%.3f,%.3f,%.3f", r2, f[0], f[1], f[2], f[3]);
bcf_append_info(b, s.s, s.l);
free(s.s);
}
bcf_hdr_destroy(hin);
bcf_destroy(b); bcf_destroy(blast);
vcf_close(bp); vcf_close(bout);
- if (hash) kh_destroy(set64, hash);
- if (vc.fn_list) free(vc.fn_list);
if (vc.fn_dict) free(vc.fn_dict);
if (vc.ploidy) free(vc.ploidy);
if (vc.n_sub) {
for (i = 0; i < vc.n_sub; ++i) free(vc.subsam[i]);
free(vc.subsam); free(vc.sublist);
}
+ if (vc.bed) bed_destroy(vc.bed);
if (seeds) free(seeds);
if (p1) bcf_p1_destroy(p1);
return 0;
}
return r;
}
+
+int bcf_main_pwld(int argc, char *argv[])
+{
+ bcf_t *fp;
+ bcf_hdr_t *h;
+ bcf1_t **b, *b0;
+ int i, j, m, n;
+ double f[4];
+ if (argc == 1) {
+ fprintf(stderr, "Usage: bcftools pwld <in.bcf>\n");
+ return 1;
+ }
+ fp = bcf_open(argv[1], "rb");
+ h = bcf_hdr_read(fp);
+ // read the entire BCF
+ m = n = 0; b = 0;
+ b0 = calloc(1, sizeof(bcf1_t));
+ while (bcf_read(fp, h, b0) >= 0) {
+ if (m == n) {
+ m = m? m<<1 : 16;
+ b = realloc(b, sizeof(void*) * m);
+ }
+ b[n] = calloc(1, sizeof(bcf1_t));
+ bcf_cpy(b[n++], b0);
+ }
+ bcf_destroy(b0);
+ // compute pair-wise r^2
+ printf("%d\n", n); // the number of loci
+ for (i = 0; i < n; ++i) {
+ printf("%s:%d", h->ns[b[i]->tid], b[i]->pos + 1);
+ for (j = 0; j < i; ++j) {
+ double r = bcf_ld_freq(b[i], b[j], f);
+ printf("\t%.3f", r*r);
+ }
+ printf("\t1.000\n");
+ }
+ // free
+ for (i = 0; i < n; ++i) bcf_destroy(b[i]);
+ free(b);
+ bcf_hdr_destroy(h);
+ bcf_close(fp);
+ return 0;
+}
int bcfview(int argc, char *argv[]);
int bcf_main_index(int argc, char *argv[]);
+int bcf_main_pwld(int argc, char *argv[]);
#define BUF_SIZE 0x10000
fprintf(stderr, "Command: view print, extract, convert and call SNPs from BCF\n");
fprintf(stderr, " index index BCF\n");
fprintf(stderr, " cat concatenate BCFs\n");
+ fprintf(stderr, " ld compute all-pair r^2\n");
fprintf(stderr, "\n");
return 1;
}
if (strcmp(argv[1], "view") == 0) return bcfview(argc-1, argv+1);
else if (strcmp(argv[1], "index") == 0) return bcf_main_index(argc-1, argv+1);
- else if (strcmp(argv[1], "cat") == 0) return bcf_cat(argc-2, argv+2);
+ else if (strcmp(argv[1], "ld") == 0) return bcf_main_pwld(argc-1, argv+1);
+ else if (strcmp(argv[1], "cat") == 0) return bcf_cat(argc-2, argv+2); // cat is different ...
else {
fprintf(stderr, "[main] Unrecognized command.\n");
return 1;
KSTREAM_INIT(gzFile, gzread, 16384)
#define MC_MAX_EM_ITER 16
-#define MC_EM_EPS 1e-4
+#define MC_EM_EPS 1e-5
#define MC_DEF_INDEL 0.15
unsigned char seq_nt4_table[256] = {
return i;
}
// f0 is the reference allele frequency
-static double mc_freq_iter(double f0, const bcf_p1aux_t *ma)
+static double mc_freq_iter(double f0, const bcf_p1aux_t *ma, int beg, int end)
{
double f, f3[3];
int i;
f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0;
- for (i = 0, f = 0.; i < ma->n; ++i) {
+ for (i = beg, f = 0.; i < end; ++i) {
double *pdg;
pdg = ma->pdg + i * 3;
f += (pdg[1] * f3[1] + 2. * pdg[2] * f3[2])
/ (pdg[0] * f3[0] + pdg[1] * f3[1] + pdg[2] * f3[2]);
}
- f /= ma->n * 2.;
+ f /= (end - beg) * 2.;
return f;
}
+static double mc_gtfreq_iter(double g[3], const bcf_p1aux_t *ma, int beg, int end)
+{
+ double err, gg[3];
+ int i;
+ gg[0] = gg[1] = gg[2] = 0.;
+ for (i = beg; i < end; ++i) {
+ double *pdg, sum, tmp[3];
+ pdg = ma->pdg + i * 3;
+ tmp[0] = pdg[0] * g[0]; tmp[1] = pdg[1] * g[1]; tmp[2] = pdg[2] * g[2];
+ sum = (tmp[0] + tmp[1] + tmp[2]) * (end - beg);
+ gg[0] += tmp[0] / sum; gg[1] += tmp[1] / sum; gg[2] += tmp[2] / sum;
+ }
+ err = fabs(gg[0] - g[0]) > fabs(gg[1] - g[1])? fabs(gg[0] - g[0]) : fabs(gg[1] - g[1]);
+ err = err > fabs(gg[2] - g[2])? err : fabs(gg[2] - g[2]);
+ g[0] = gg[0]; g[1] = gg[1]; g[2] = gg[2];
+ return err;
+}
+
int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k)
{
double sum, g[3];
for (k1 = 0, z = 0.; k1 <= 2*n1; ++k1)
for (k2 = 0; k2 <= 2*n2; ++k2)
if ((y = contrast2_aux(p1, sum, n1, n2, k1, k2, ret)) >= 0) z += y;
+ if (ret[0] + ret[1] + ret[2] < 0.99) // It seems that this may be caused by floating point errors. I do not really understand why...
+ z = 1.0, ret[0] = ret[1] = ret[2] = 1./3;
}
return (double)z;
}
{ // calculate f_em
double flast = rst->f_flat;
for (i = 0; i < MC_MAX_EM_ITER; ++i) {
- rst->f_em = mc_freq_iter(flast, ma);
+ rst->f_em = mc_freq_iter(flast, ma, 0, ma->n);
if (fabs(rst->f_em - flast) < MC_EM_EPS) break;
flast = rst->f_em;
}
+ if (ma->n1 > 0 && ma->n1 < ma->n) {
+ for (k = 0; k < 2; ++k) {
+ flast = rst->f_em;
+ for (i = 0; i < MC_MAX_EM_ITER; ++i) {
+ rst->f_em2[k] = k? mc_freq_iter(flast, ma, ma->n1, ma->n) : mc_freq_iter(flast, ma, 0, ma->n1);
+ if (fabs(rst->f_em2[k] - flast) < MC_EM_EPS) break;
+ flast = rst->f_em2[k];
+ }
+ }
+ }
+ }
+ { // compute g[3]
+ rst->g[0] = (1. - rst->f_em) * (1. - rst->f_em);
+ rst->g[1] = 2. * rst->f_em * (1. - rst->f_em);
+ rst->g[2] = rst->f_em * rst->f_em;
+ for (i = 0; i < MC_MAX_EM_ITER; ++i)
+ if (mc_gtfreq_iter(rst->g, ma, 0, ma->n) < MC_EM_EPS) break;
}
{ // estimate equal-tail credible interval (95% level)
int l, h;
h = i;
rst->cil = (double)(ma->M - h) / ma->M; rst->cih = (double)(ma->M - l) / ma->M;
}
- rst->g[0] = rst->g[1] = rst->g[2] = -1.;
rst->cmp[0] = rst->cmp[1] = rst->cmp[2] = rst->p_chi2 = -1.0;
if (rst->p_var > 0.1) // skip contrast2() if the locus is a strong non-variant
rst->p_chi2 = contrast2(ma, rst->cmp);
int rank0, perm_rank; // NB: perm_rank is always set to -1 by bcf_p1_cal()
double f_em, f_exp, f_flat, p_ref_folded, p_ref, p_var_folded, p_var;
double cil, cih;
- double cmp[3], p_chi2; // used by contrast2()
+ double cmp[3], p_chi2, f_em2[2]; // used by contrast2()
double g[3];
} bcf_p1rst_t;
--- /dev/null
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+#include <zlib.h>
+
+#include "kseq.h"
+KSTREAM_INIT(gzFile, gzread, 8192)
+
+typedef struct {
+ int n, m;
+ uint64_t *a;
+ int *idx;
+} bed_reglist_t;
+
+#include "khash.h"
+KHASH_MAP_INIT_STR(reg, bed_reglist_t)
+
+#define LIDX_SHIFT 13
+
+typedef kh_reg_t reghash_t;
+
+int *bed_index_core(int n, uint64_t *a, int *n_idx)
+{
+ int i, j, m, *idx;
+ m = *n_idx = 0; idx = 0;
+ for (i = 0; i < n; ++i) {
+ int beg, end;
+ beg = a[i]>>32 >> LIDX_SHIFT; end = ((uint32_t)a[i]) >> LIDX_SHIFT;
+ if (m < end + 1) {
+ int oldm = m;
+ m = end + 1;
+ kroundup32(m);
+ idx = realloc(idx, m * sizeof(int));
+ for (j = oldm; j < m; ++j) idx[j] = -1;
+ }
+ if (beg == end) {
+ if (idx[beg] < 0) idx[beg] = i;
+ } else {
+ for (j = beg; j <= end; ++j)
+ if (idx[j] < 0) idx[j] = i;
+ }
+ *n_idx = end + 1;
+ }
+ return idx;
+}
+
+void bed_index(void *_h)
+{
+ reghash_t *h = (reghash_t*)_h;
+ khint_t k;
+ for (k = 0; k < kh_end(h); ++k) {
+ if (kh_exist(h, k)) {
+ bed_reglist_t *p = &kh_val(h, k);
+ if (p->idx) free(p->idx);
+ p->idx = bed_index_core(p->n, p->a, &p->m);
+ }
+ }
+}
+
+int bed_overlap_core(const bed_reglist_t *p, int beg, int end)
+{
+ int i, min_off;
+ if (p->n == 0) return 0;
+ min_off = (beg>>LIDX_SHIFT >= p->n)? p->idx[p->n-1] : p->idx[beg>>LIDX_SHIFT];
+ if (min_off < 0) { // TODO: this block can be improved, but speed should not matter too much here
+ int n = beg>>LIDX_SHIFT;
+ if (n > p->n) n = p->n;
+ for (i = n - 1; i >= 0; --i)
+ if (p->idx[i] >= 0) break;
+ min_off = i >= 0? p->idx[i] : 0;
+ }
+ for (i = min_off; i < p->n; ++i) {
+ if ((int)(p->a[i]>>32) >= end) break; // out of range; no need to proceed
+ if ((int32_t)p->a[i] > beg && (int32_t)(p->a[i]>>32) < end)
+ return 1; // find the overlap; return
+ }
+ return 0;
+}
+
+int bed_overlap(const void *_h, const char *chr, int beg, int end)
+{
+ const reghash_t *h = (const reghash_t*)_h;
+ khint_t k;
+ if (!h) return 0;
+ k = kh_get(reg, h, chr);
+ if (k == kh_end(h)) return 0;
+ return bed_overlap_core(&kh_val(h, k), beg, end);
+}
+
+void *bed_read(const char *fn)
+{
+ reghash_t *h = kh_init(reg);
+ gzFile fp;
+ kstream_t *ks;
+ int dret;
+ kstring_t *str;
+ // read the list
+ fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
+ if (fp == 0) return 0;
+ str = calloc(1, sizeof(kstring_t));
+ ks = ks_init(fp);
+ while (ks_getuntil(ks, 0, str, &dret) >= 0) { // read the chr name
+ int beg = -1, end = -1;
+ bed_reglist_t *p;
+ khint_t k = kh_get(reg, h, str->s);
+ if (k == kh_end(h)) { // absent from the hash table
+ int ret;
+ char *s = strdup(str->s);
+ k = kh_put(reg, h, s, &ret);
+ memset(&kh_val(h, k), 0, sizeof(bed_reglist_t));
+ }
+ p = &kh_val(h, k);
+ if (dret != '\n') { // if the lines has other characters
+ if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) {
+ beg = atoi(str->s); // begin
+ if (dret != '\n') {
+ if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0]))
+ end = atoi(str->s); // end
+ }
+ }
+ }
+ if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); // skip the rest of the line
+ if (end < 0 && beg > 0) end = beg, beg = beg - 1; // if there is only one column
+ if (beg >= 0 && end > beg) {
+ if (p->n == p->m) {
+ p->m = p->m? p->m<<1 : 4;
+ p->a = realloc(p->a, p->m * 8);
+ }
+ p->a[p->n++] = (uint64_t)beg<<32 | end;
+ }
+ }
+ ks_destroy(ks);
+ gzclose(fp);
+ free(str->s); free(str);
+ bed_index(h);
+ return h;
+}
+
+void bed_destroy(void *_h)
+{
+ reghash_t *h = (reghash_t*)_h;
+ khint_t k;
+ for (k = 0; k < kh_end(h); ++k) {
+ if (kh_exist(h, k)) {
+ free(kh_val(h, k).a);
+ free(kh_val(h, k).idx);
+ free((char*)kh_key(h, k));
+ }
+ }
+ kh_destroy(reg, h);
+}
fp->error = message;
}
+int bgzf_check_bgzf(const char *fn)
+{
+ BGZF *fp;
+ uint8_t buf[10],magic[10]="\037\213\010\4\0\0\0\0\0\377";
+ int n;
+
+ if ((fp = bgzf_open(fn, "r")) == 0)
+ {
+ fprintf(stderr, "[bgzf_check_bgzf] failed to open the file: %s\n",fn);
+ return -1;
+ }
+
+#ifdef _USE_KNETFILE
+ n = knet_read(fp->x.fpr, buf, 10);
+#else
+ n = fread(buf, 1, 10, fp->file);
+#endif
+ bgzf_close(fp);
+
+ if ( n!=10 )
+ return -1;
+
+ if ( !memcmp(magic, buf, 10) ) return 1;
+ return 0;
+}
+
static BGZF *bgzf_read_init()
{
BGZF *fp;
int bgzf_read_block(BGZF* fp);
int bgzf_flush(BGZF* fp);
int bgzf_flush_try(BGZF *fp, int size);
+int bgzf_check_bgzf(const char *fn);
#ifdef __cplusplus
}
append_header_text(fp->header, textheader->text, textheader->l_text);
bam_header_destroy(textheader);
}
- if (fp->header->n_targets == 0)
+ if (fp->header->n_targets == 0 && bam_verbose >= 1)
fprintf(stderr, "[samopen] no @SQ lines in the header.\n");
- } else fprintf(stderr, "[samopen] SAM header is present: %d sequences.\n", fp->header->n_targets);
+ } else if (bam_verbose >= 2) fprintf(stderr, "[samopen] SAM header is present: %d sequences.\n", fp->header->n_targets);
}
} else if (strchr(mode, 'w')) { // write
fp->header = bam_header_dup((const bam_header_t*)aux);
sam_header_parse(alt);
alt->l_text = 0; alt->text = 0;
// check if there are @SQ lines in the header
- fwrite(fp->header->text, 1, fp->header->l_text, fp->x.tamw);
+ fwrite(fp->header->text, 1, fp->header->l_text, fp->x.tamw); // FIXME: better to skip the trailing NULL
if (alt->n_targets) { // then write the header text without dumping ->target_{name,len}
- if (alt->n_targets != fp->header->n_targets)
- fprintf(stderr, "[samopen] inconsistent number of target sequences.\n");
+ if (alt->n_targets != fp->header->n_targets && bam_verbose >= 1)
+ fprintf(stderr, "[samopen] inconsistent number of target sequences. Output the text header.\n");
} else { // then dump ->target_{name,len}
for (i = 0; i < fp->header->n_targets; ++i)
fprintf(fp->x.tamw, "@SQ\tSN:%s\tLN:%d\n", fp->header->target_name[i], fp->header->target_len[i]);
if (access(fn_ref, R_OK) == -1) {
fprintf(stderr, "[samfaipath] fail to read file %s.\n", fn_ref);
} else {
- fprintf(stderr, "[samfaipath] build FASTA index...\n");
+ if (bam_verbose >= 3) fprintf(stderr, "[samfaipath] build FASTA index...\n");
if (fai_build(fn_ref) == -1) {
fprintf(stderr, "[samfaipath] fail to build FASTA index.\n");
free(fn_list); fn_list = 0;
const char *text;
char *buf=NULL;
size_t nbuf = 0;
+ int tovalidate = 0;
if ( !headerText )
return 0;
while ( (text=nextline(&buf, &nbuf, text)) )
{
hline = sam_header_line_parse(buf);
- if ( hline && sam_header_line_validate(hline) )
+ if ( hline && (!tovalidate || sam_header_line_validate(hline)) )
// With too many (~250,000) reference sequences the header parsing was too slow with list_append.
hlines = list_append_to_end(hlines, hline);
else
#include "sam_header.h"
#include "sam.h"
#include "faidx.h"
+#include "kstring.h"
#include "khash.h"
KHASH_SET_INIT_STR(rg)
typedef khash_t(rg) *rghash_t;
-rghash_t g_rghash = 0;
+static rghash_t g_rghash = 0;
static int g_min_mapQ = 0, g_flag_on = 0, g_flag_off = 0;
static char *g_library, *g_rg;
static int g_sol2sanger_tbl[128];
+static void *g_bed;
+
+void *bed_read(const char *fn);
+void bed_destroy(void *_h);
+int bed_overlap(const void *_h, const char *chr, int beg, int end);
static void sol2sanger(bam1_t *b)
{
{
if (b->core.qual < g_min_mapQ || ((b->core.flag & g_flag_on) != g_flag_on) || (b->core.flag & g_flag_off))
return 1;
+ if (g_bed && b->core.tid >= 0 && !bed_overlap(g_bed, h->target_name[b->core.tid], b->core.pos, bam_calend(&b->core, bam1_cigar(b))))
+ return 1;
if (g_rg || g_rghash) {
uint8_t *s = bam_aux_get(b, "RG");
if (s) {
return 0;
}
+static char *drop_rg(char *hdtxt, rghash_t h, int *len)
+{
+ char *p = hdtxt, *q, *r, *s;
+ kstring_t str;
+ memset(&str, 0, sizeof(kstring_t));
+ while (1) {
+ int toprint = 0;
+ q = strchr(p, '\n');
+ if (q == 0) q = p + strlen(p);
+ if (q - p < 3) break; // the line is too short; then stop
+ if (strncmp(p, "@RG\t", 4) == 0) {
+ int c;
+ khint_t k;
+ if ((r = strstr(p, "\tID:")) != 0) {
+ r += 4;
+ for (s = r; *s != '\0' && *s != '\n' && *s != '\t'; ++s);
+ c = *s; *s = '\0';
+ k = kh_get(rg, h, r);
+ *s = c;
+ if (k != kh_end(h)) toprint = 1;
+ }
+ } else toprint = 1;
+ if (toprint) {
+ kputsn(p, q - p, &str); kputc('\n', &str);
+ }
+ p = q + 1;
+ }
+ *len = str.l;
+ return str.s;
+}
+
// callback function for bam_fetch() that prints nonskipped records
static int view_func(const bam1_t *b, void *data)
{
/* parse command-line options */
strcpy(in_mode, "r"); strcpy(out_mode, "w");
- while ((c = getopt(argc, argv, "Sbct:h1Ho:q:f:F:ul:r:xX?T:CR:")) >= 0) {
+ while ((c = getopt(argc, argv, "Sbct:h1Ho:q:f:F:ul:r:xX?T:CR:L:")) >= 0) {
switch (c) {
case 'c': is_count = 1; break;
case 'C': slx2sngr = 1; break;
case 'u': compress_level = 0; break;
case '1': compress_level = 1; break;
case 'l': g_library = strdup(optarg); break;
+ case 'L': g_bed = bed_read(optarg); break;
case 'r': g_rg = strdup(optarg); break;
case 'R': fn_rg = strdup(optarg); break;
case 'x': of_type = BAM_OFHEX; break;
ret = 1;
goto view_end;
}
+ if (g_rghash) { // FIXME: I do not know what "bam_header_t::n_text" is for...
+ char *tmp;
+ int l;
+ tmp = drop_rg(in->header->text, g_rghash, &l);
+ free(in->header->text);
+ in->header->text = tmp;
+ in->header->l_text = l;
+ }
if (!is_count && (out = samopen(fn_out? fn_out : "-", out_mode, in->header)) == 0) {
fprintf(stderr, "[main_samview] fail to open \"%s\" for writing.\n", fn_out? fn_out : "standard output");
ret = 1;
}
// close files, free and return
free(fn_list); free(fn_ref); free(fn_out); free(g_library); free(g_rg); free(fn_rg);
+ if (g_bed) bed_destroy(g_bed);
if (g_rghash) {
khint_t k;
for (k = 0; k < kh_end(g_rghash); ++k)
fprintf(stderr, " -x output FLAG in HEX (samtools-C specific)\n");
fprintf(stderr, " -X output FLAG in string (samtools-C specific)\n");
fprintf(stderr, " -c print only the count of matching records\n");
+ fprintf(stderr, " -L FILE output alignments overlapping the input BED FILE [null]\n");
fprintf(stderr, " -t FILE list of reference names and lengths (force -S) [null]\n");
fprintf(stderr, " -T FILE reference sequence file (force -S) [null]\n");
fprintf(stderr, " -o FILE output file name [stdout]\n");
++n;
}
if (n == 0) add_pair(sm, sm2id, fn, fn);
+// add_pair(sm, sm2id, fn, fn);
free(buf.s);
return 0;
}