From e42445c748ed193f8d0a4277a844fa848f30c132 Mon Sep 17 00:00:00 2001 From: Charles Plessy Date: Tue, 12 Apr 2011 19:40:07 +0900 Subject: [PATCH] Imported Upstream version 0.1.15 --- Makefile | 4 +- NEWS | 69 +++++++++++++++++++-- bam.c | 2 +- bam.h | 10 ++- bam2depth.c | 112 ++++++++++++++++++++++++++++++++++ bam_plcmd.c | 102 +++++++++++++++++-------------- bam_sort.c | 13 ++-- bam_stat.c | 55 ++++++++--------- bamtk.c | 3 + bcftools/Makefile | 2 +- bcftools/call1.c | 139 +++++++++++++++++++----------------------- bcftools/ld.c | 43 +++++++++++++ bcftools/main.c | 5 +- bcftools/prob1.c | 48 +++++++++++++-- bcftools/prob1.h | 2 +- bedidx.c | 152 ++++++++++++++++++++++++++++++++++++++++++++++ bgzf.c | 26 ++++++++ bgzf.h | 1 + sam.c | 12 ++-- sam_header.c | 3 +- sam_view.c | 54 +++++++++++++++- sample.c | 1 + 22 files changed, 674 insertions(+), 184 deletions(-) create mode 100644 bam2depth.c create mode 100644 bedidx.c diff --git a/Makefile b/Makefile index f025942..4fc03d5 100644 --- a/Makefile +++ b/Makefile @@ -3,12 +3,12 @@ CFLAGS= -g -Wall -O2 #-m64 #-arch ppc DFLAGS= -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE -D_USE_KNETFILE -D_CURSES_LIB=1 KNETFILE_O= knetfile.o LOBJS= bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \ - bam_pileup.o bam_lpileup.o bam_md.o glf.o razf.o faidx.o \ + bam_pileup.o bam_lpileup.o bam_md.o glf.o razf.o faidx.o bedidx.o \ $(KNETFILE_O) bam_sort.o sam_header.o bam_reheader.o kprobaln.o bam_cat.o AOBJS= bam_tview.o bam_maqcns.o bam_plcmd.o sam_view.o \ bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \ bamtk.o kaln.o bam2bcf.o bam2bcf_indel.o errmod.o sample.o \ - cut_target.o phase.o + cut_target.o phase.o bam2depth.o PROG= samtools INCLUDES= -I. SUBDIRS= . bcftools misc diff --git a/NEWS b/NEWS index 946ba7b..de55679 100644 --- a/NEWS +++ b/NEWS @@ -1,4 +1,37 @@ -Beta release 0.1.14 (16 March, 2011) +Beta Release 0.1.15 (10 April, 2011) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Noteable changes: + + * Allow to perform variant calling or to extract information in multiple + regions specified by a BED file (`samtools mpileup -l', `samtools view -L' + and `bcftools view -l'). + + * Added the `depth' command to samtools to compute the per-base depth with a + simpler interface. File `bam2depth.c', which implements this command, is the + recommended example on how to use the mpileup APIs. + + * Estimate genotype frequencies with ML; perform chi^2 based Hardy-Weinberg + test using this estimate. + + * For `samtools view', when `-R' is specified, drop read groups in the header + that are not contained in the specified file. + + * For `samtools flagstat', separate QC-pass and QC-fail reads. + + * Improved the command line help of `samtools mpileup' and `bcftools view'. + + * Use a global variable to control the verbose level of samtools stderr + output. Nonetheless, it has not been full utilized. + + * Fixed an issue in association test which may report false associations, + possibly due to floating point underflow. + +(0.1.15: 10 April 2011, r949:203) + + + +Beta release 0.1.14 (21 March, 2011) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ This release implements a method for testing associations for case-control @@ -9,17 +42,41 @@ has not been evaluated on real data. Another new feature is to make X chromosome variant calls when female and male samples are both present. The user needs to provide a file indicating the -ploidy of each sample. +ploidy of each sample (see also manual bcftools/bcftools.1). + +Other notable changes: + + * Added `bcftools view -F' to parse BCF files generated by samtools r921 or + older which encodes PL in a different way. -Other new features: + * Changed the behavior of `bcftools view -s'. Now when a list of samples is + provided, the samples in the output will be reordered to match the ordering + in the sample list. This change is mainly designed for association test. + + * Sped up `bcftools view -v' for target sequencing given thousands of samples. + Also added a new option `view -d' to skip loci where only a few samples are + covered by reads. + + * Dropped HWE test. This feature has never been implemented properly. An EM + should be much better. To be implemented in future. + + * Added the `cat' command to samtools. This command concatenate BAMs with + identical sequence dictionaries in an efficient way. Modified from bam_cat.c + written by Chris Saunders. + + * Added `samtools view -1' to write BAMs at a low compression level but twice + faster to create. The `sort' command generates temporary files at a low + compression level as well. + + * Added `samtools mpileup -6' to accept "BAM" with Illumina 1.3+ quality + strings (strictly speaking, such a file is not BAM). * Added `samtools mpileup -L' to skip INDEL calling in regions with excessively high coverage. Such regions dramatically slow down mpileup. - * Added `bcftools view -F' to parse BCF files generated by samtools r921 or - older which encode PL in a different way. + * Updated `misc/export2sam.pl', provided by Chris Saunders from Illumina Inc. -(0.1.14: 16 March 2011, r930:163) +(0.1.14: 21 March 2011, r933:170) diff --git a/bam.c b/bam.c index 96aace2..a65aed5 100644 --- a/bam.c +++ b/bam.c @@ -7,7 +7,7 @@ #include "kstring.h" #include "sam_header.h" -int bam_is_be = 0; +int bam_is_be = 0, bam_verbose = 2; char *bam_flag2char_table = "pPuUrR12sfd\0\0\0\0\0"; /************************** diff --git a/bam.h b/bam.h index bc8e3f1..87e3de3 100644 --- a/bam.h +++ b/bam.h @@ -33,14 +33,14 @@ BAM library provides I/O and various operations on manipulating files in the BAM (Binary Alignment/Mapping) or SAM (Sequence Alignment/Map) - format. It now supports importing from or exporting to TAM, sorting, + format. It now supports importing from or exporting to SAM, sorting, merging, generating pileup, and quickly retrieval of reads overlapped with a specified region. @copyright Genome Research Ltd. */ -#define BAM_VERSION "0.1.14 (r933:170)" +#define BAM_VERSION "0.1.15 (r949:203)" #include #include @@ -264,6 +264,12 @@ typedef struct __bam_iter_t *bam_iter_t; */ extern int bam_is_be; +/*! + @abstract Verbose level between 0 and 3; 0 is supposed to disable all + debugging information, though this may not have been implemented. + */ +extern int bam_verbose; + /*! @abstract Table for converting a nucleotide character to the 4-bit encoding. */ extern unsigned char bam_nt16_table[256]; diff --git a/bam2depth.c b/bam2depth.c new file mode 100644 index 0000000..ca36b89 --- /dev/null +++ b/bam2depth.c @@ -0,0 +1,112 @@ +/* This program demonstrates how to generate pileup from multiple BAMs + * simutaneously, to achieve random access and to use the BED interface. + * To compile this program separately, you may: + * + * gcc -g -O2 -Wall -o bam2depth -D_MAIN_BAM2DEPTH bam2depth.c -L. -lbam -lz + */ +#include +#include +#include +#include +#include "bam.h" + +typedef struct { // auxiliary data structure + bamFile fp; // the file handler + bam_iter_t iter; // NULL if a region not specified + int min_mapQ; // mapQ filter +} aux_t; + +void *bed_read(const char *fn); // read a BED or position list file +void bed_destroy(void *_h); // destroy the BED data structure +int bed_overlap(const void *_h, const char *chr, int beg, int end); // test if chr:beg-end overlaps + +// This function reads a BAM alignment from one BAM file. +static int read_bam(void *data, bam1_t *b) // read level filters better go here to avoid pileup +{ + aux_t *aux = (aux_t*)data; // data in fact is a pointer to an auxiliary structure + int ret = aux->iter? bam_iter_read(aux->fp, aux->iter, b) : bam_read1(aux->fp, b); + if ((int)b->core.qual < aux->min_mapQ) b->core.flag |= BAM_FUNMAP; + return ret; +} + +#ifdef _MAIN_BAM2DEPTH +int main(int argc, char *argv[]) +#else +int main_depth(int argc, char *argv[]) +#endif +{ + int i, n, tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0; + const bam_pileup1_t **plp; + char *reg = 0; // specified region + void *bed = 0; // BED data structure + bam_header_t *h = 0; // BAM header of the 1st input + aux_t **data; + bam_mplp_t mplp; + + // parse the command line + while ((n = getopt(argc, argv, "r:b:q:Q:")) >= 0) { + switch (n) { + case 'r': reg = strdup(optarg); break; // parsing a region requires a BAM header + case 'b': bed = bed_read(optarg); break; // BED or position list file can be parsed now + case 'q': baseQ = atoi(optarg); break; // base quality threshold + case 'Q': mapQ = atoi(optarg); break; // mapping quality threshold + } + } + if (optind == argc) { + fprintf(stderr, "Usage: bam2depth [-r reg] [-q baseQthres] [-Q mapQthres] [-b in.bed] [...]\n"); + return 1; + } + + // initialize the auxiliary data structures + n = argc - optind; // the number of BAMs on the command line + data = calloc(n, sizeof(void*)); // data[i] for the i-th input + beg = 0; end = 1<<30; tid = -1; // set the default region + for (i = 0; i < n; ++i) { + bam_header_t *htmp; + data[i] = calloc(1, sizeof(aux_t)); + data[i]->fp = bam_open(argv[optind+i], "r"); // open BAM + data[i]->min_mapQ = mapQ; // set the mapQ filter + htmp = bam_header_read(data[i]->fp); // read the BAM header + if (i == 0) { + h = htmp; // keep the header of the 1st BAM + if (reg) bam_parse_region(h, reg, &tid, &beg, &end); // also parse the region + } else bam_header_destroy(htmp); // if not the 1st BAM, trash the header + if (tid >= 0) { // if a region is specified and parsed successfully + bam_index_t *idx = bam_index_load(argv[optind+i]); // load the index + data[i]->iter = bam_iter_query(idx, tid, beg, end); // set the iterator + bam_index_destroy(idx); // the index is not needed any more; phase out of the memory + } + } + + // the core multi-pileup loop + mplp = bam_mplp_init(n, read_bam, (void**)data); // initialization + n_plp = calloc(n, sizeof(int)); // n_plp[i] is the number of covering reads from the i-th BAM + plp = calloc(n, sizeof(void*)); // plp[i] points to the array of covering reads (internal in mplp) + while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0) { // come to the next covered position + if (pos < beg || pos >= end) continue; // out of range; skip + if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue; // not in BED; skip + fputs(h->target_name[tid], stdout); printf("\t%d", pos+1); // a customized printf() would be faster + for (i = 0; i < n; ++i) { // base level filters have to go here + int j, m = 0; + for (j = 0; j < n_plp[i]; ++j) { + const bam_pileup1_t *p = plp[i] + j; // DON'T modfity plp[][] unless you really know + if (p->is_del || p->is_refskip) ++m; // having dels or refskips at tid:pos + else if (bam1_qual(p->b)[p->qpos] < baseQ) ++m; // low base quality + } + printf("\t%d", n_plp[i] - m); // this the depth to output + } + putchar('\n'); + } + free(n_plp); free(plp); + bam_mplp_destroy(mplp); + + bam_header_destroy(h); + for (i = 0; i < n; ++i) { + bam_close(data[i]->fp); + if (data[i]->iter) bam_iter_destroy(data[i]->iter); + free(data[i]); + } + free(data); free(reg); + if (bed) bed_destroy(bed); + return 0; +} diff --git a/bam_plcmd.c b/bam_plcmd.c index fd75cec..f6381c3 100644 --- a/bam_plcmd.c +++ b/bam_plcmd.c @@ -437,6 +437,7 @@ int bam_pileup(int argc, char *argv[]) fprintf(stderr, " -G FLOAT prior of an indel between two haplotypes (for -c) [%.4g]\n", d->ido->r_indel); fprintf(stderr, " -I INT phred prob. of an indel in sequencing/prep. (for -c) [%d]\n", d->ido->q_indel); fprintf(stderr, "\n"); + fprintf(stderr, "Warning: Please use the `mpileup' command instead `pileup'. `Pileup' is deprecated!\n\n"); free(fn_list); free(fn_fa); free(d); return 1; } @@ -536,19 +537,23 @@ int bam_pileup(int argc, char *argv[]) #define MPLP_EXT_BAQ 0x800 #define MPLP_ILLUMINA13 0x1000 +void *bed_read(const char *fn); +void bed_destroy(void *_h); +int bed_overlap(const void *_h, const char *chr, int beg, int end); + typedef struct { int max_mq, min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth; int openQ, extQ, tandemQ, min_support; // for indels double min_frac; // for indels - char *reg, *fn_pos, *pl_list; + char *reg, *pl_list; faidx_t *fai; - kh_64_t *hash; - void *rghash; + void *bed, *rghash; } mplp_conf_t; typedef struct { bamFile fp; bam_iter_t iter; + bam_header_t *h; int ref_id; char *ref; const mplp_conf_t *conf; @@ -571,6 +576,14 @@ static int mplp_func(void *data, bam1_t *b) int has_ref; ret = ma->iter? bam_iter_read(ma->fp, ma->iter, b) : bam_read1(ma->fp, b); if (ret < 0) break; + if (b->core.tid < 0 || (b->core.flag&BAM_FUNMAP)) { // exclude unmapped reads + skip = 1; + continue; + } + if (ma->conf->bed) { // test overlap + skip = !bed_overlap(ma->conf->bed, ma->h->target_name[b->core.tid], b->core.pos, bam_calend(&b->core, bam1_cigar(b))); + if (skip) continue; + } if (ma->conf->rghash) { // exclude read groups uint8_t *rg = bam_aux_get(b, "RG"); skip = (rg && bcf_str2id(ma->conf->rghash, (const char*)(rg+1)) >= 0); @@ -589,7 +602,7 @@ static int mplp_func(void *data, bam1_t *b) int q = bam_cap_mapQ(b, ma->ref, ma->conf->capQ_thres); if (q < 0) skip = 1; else if (b->core.qual > q) b->core.qual = q; - } else if (b->core.flag&BAM_FUNMAP) skip = 1; + } else if (b->core.qual < ma->conf->min_mq) skip = 1; else if ((ma->conf->flag&MPLP_NO_ORPHAN) && (b->core.flag&1) && !(b->core.flag&2)) skip = 1; } while (skip); @@ -609,7 +622,11 @@ static void group_smpl(mplp_pileup_t *m, bam_sample_t *sm, kstring_t *buf, q = bam_aux_get(p->b, "RG"); if (q) id = bam_smpl_rg2smid(sm, fn[i], (char*)q+1, buf); if (id < 0) id = bam_smpl_rg2smid(sm, fn[i], 0, buf); - assert(id >= 0 && id < m->n); + if (id < 0 || id >= m->n) { + assert(q); // otherwise a bug + fprintf(stderr, "[%s] Read group %s used in file %s but absent from the header or an alignment missing read group.\n", __func__, (char*)q+1, fn[i]); + exit(1); + } if (m->n_plp[id] == m->m_plp[id]) { m->m_plp[id] = m->m_plp[id]? m->m_plp[id]<<1 : 8; m->plp[id] = realloc(m->plp[id], sizeof(bam_pileup1_t) * m->m_plp[id]); @@ -629,7 +646,6 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) bam_mplp_t iter; bam_header_t *h = 0; char *ref; - khash_t(64) *hash = 0; void *rghash = 0; bcf_callaux_t *bca = 0; @@ -657,6 +673,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) data[i]->fp = strcmp(fn[i], "-") == 0? bam_dopen(fileno(stdin), "r") : bam_open(fn[i], "r"); data[i]->conf = conf; h_tmp = bam_header_read(data[i]->fp); + data[i]->h = i? h : h_tmp; // for i==0, "h" has not been set yet bam_smpl_add(sm, fn[i], h_tmp->text); rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list); if (conf->reg) { @@ -687,7 +704,6 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) gplp.plp = calloc(sm->n, sizeof(void*)); fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, sm->n, n); - if (conf->fn_pos) hash = load_pos(conf->fn_pos, h); // write the VCF header if (conf->flag & MPLP_GLF) { kstring_t s; @@ -727,17 +743,13 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) fprintf(stderr, "(%s) Max depth is above 1M. Potential memory hog!\n", __func__); if (max_depth * sm->n < 8000) { max_depth = 8000 / sm->n; - fprintf(stderr, "<%s> Set max per-sample depth to %d\n", __func__, max_depth); + fprintf(stderr, "<%s> Set max per-file depth to %d\n", __func__, max_depth); } max_indel_depth = conf->max_indel_depth * sm->n; bam_mplp_set_maxcnt(iter, max_depth); while (bam_mplp_auto(iter, &tid, &pos, n_plp, plp) > 0) { if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested - if (hash) { - khint_t k; - k = kh_get(64, hash, (uint64_t)tid<<32 | pos); - if (k == kh_end(hash)) continue; - } + if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue; if (tid != ref_tid) { free(ref); ref = 0; if (conf->fai) ref = fai_fetch(conf->fai, h->target_name[tid], &ref_len); @@ -797,12 +809,6 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) for (i = 0; i < gplp.n; ++i) free(gplp.plp[i]); free(gplp.plp); free(gplp.n_plp); free(gplp.m_plp); bcf_call_del_rghash(rghash); - if (hash) { // free the hash table - khint_t k; - for (k = kh_begin(hash); k < kh_end(hash); ++k) - if (kh_exist(hash, k)) free(kh_val(hash, k)); - kh_destroy(64, hash); - } bcf_hdr_destroy(bh); bcf_call_destroy(bca); free(bc.PL); free(bcr); bam_mplp_destroy(iter); bam_header_destroy(h); @@ -887,7 +893,7 @@ int bam_mpileup(int argc, char *argv[]) break; case 'd': mplp.max_depth = atoi(optarg); break; case 'r': mplp.reg = strdup(optarg); break; - case 'l': mplp.fn_pos = strdup(optarg); break; + case 'l': mplp.bed = bed_read(optarg); break; case 'P': mplp.pl_list = strdup(optarg); break; case 'g': mplp.flag |= MPLP_GLF; break; case 'u': mplp.flag |= MPLP_NO_COMP | MPLP_GLF; break; @@ -927,32 +933,35 @@ int bam_mpileup(int argc, char *argv[]) if (use_orphan) mplp.flag &= ~MPLP_NO_ORPHAN; if (argc == 1) { fprintf(stderr, "\n"); - fprintf(stderr, "Usage: samtools mpileup [options] in1.bam [in2.bam [...]]\n\n"); - fprintf(stderr, "Options: -f FILE reference sequence file [null]\n"); - fprintf(stderr, " -r STR region in which pileup is generated [null]\n"); - fprintf(stderr, " -l FILE list of positions (format: chr pos) [null]\n"); - fprintf(stderr, " -b FILE list of input BAM files [null]\n"); - fprintf(stderr, " -M INT cap mapping quality at INT [%d]\n", mplp.max_mq); - fprintf(stderr, " -Q INT min base quality [%d]\n", mplp.min_baseQ); - fprintf(stderr, " -q INT filter out alignment with MQ smaller than INT [%d]\n", mplp.min_mq); - fprintf(stderr, " -d INT max per-BAM depth [%d]\n", mplp.max_depth); - fprintf(stderr, " -L INT max per-sample depth for INDEL calling [%d]\n", mplp.max_indel_depth); - fprintf(stderr, " -P STR comma separated list of platforms for indels [all]\n"); - fprintf(stderr, " -o INT Phred-scaled gap open sequencing error probability [%d]\n", mplp.openQ); - fprintf(stderr, " -e INT Phred-scaled gap extension seq error probability [%d]\n", mplp.extQ); - fprintf(stderr, " -h INT coefficient for homopolyer errors [%d]\n", mplp.tandemQ); - fprintf(stderr, " -m INT minimum gapped reads for indel candidates [%d]\n", mplp.min_support); - fprintf(stderr, " -F FLOAT minimum fraction of gapped reads for candidates [%g]\n", mplp.min_frac); - fprintf(stderr, " -G FILE exclude read groups listed in FILE [null]\n"); - fprintf(stderr, " -6 quality is in the Illumina-1.3+ encoding\n"); - fprintf(stderr, " -A use anomalous read pairs in SNP/INDEL calling\n"); - fprintf(stderr, " -g generate BCF output\n"); - fprintf(stderr, " -u do not compress BCF output\n"); - fprintf(stderr, " -E extended BAQ for higher sensitivity but lower specificity\n"); - fprintf(stderr, " -B disable BAQ computation\n"); - fprintf(stderr, " -D output per-sample DP\n"); - fprintf(stderr, " -S output per-sample SP (strand bias P-value, slow)\n"); - fprintf(stderr, " -I do not perform indel calling\n"); + fprintf(stderr, "Usage: samtools mpileup [options] in1.bam [in2.bam [...]]\n\n"); + fprintf(stderr, "Input options:\n\n"); + fprintf(stderr, " -6 assume the quality is in the Illumina-1.3+ encoding\n"); + fprintf(stderr, " -A count anomalous read pairs\n"); + fprintf(stderr, " -B disable BAQ computation\n"); + fprintf(stderr, " -b FILE list of input BAM files [null]\n"); + fprintf(stderr, " -d INT max per-BAM depth to avoid excessive memory usage [%d]\n", mplp.max_depth); + fprintf(stderr, " -E extended BAQ for higher sensitivity but lower specificity\n"); + fprintf(stderr, " -f FILE faidx indexed reference sequence file [null]\n"); + fprintf(stderr, " -G FILE exclude read groups listed in FILE [null]\n"); + fprintf(stderr, " -l FILE list of positions (chr pos) or regions (BED) [null]\n"); + fprintf(stderr, " -M INT cap mapping quality at INT [%d]\n", mplp.max_mq); + fprintf(stderr, " -r STR region in which pileup is generated [null]\n"); + fprintf(stderr, " -q INT skip alignments with mapQ smaller than INT [%d]\n", mplp.min_mq); + fprintf(stderr, " -Q INT skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp.min_baseQ); + fprintf(stderr, "\nOutput options:\n\n"); + fprintf(stderr, " -D output per-sample DP in BCF (require -g/-u)\n"); + fprintf(stderr, " -g generate BCF output (genotype likelihoods)\n"); + fprintf(stderr, " -S output per-sample strand bias P-value in BCF (require -g/-u)\n"); + fprintf(stderr, " -u generate uncompress BCF output\n"); + fprintf(stderr, "\nSNP/INDEL genotype likelihoods options (effective with `-g' or `-u'):\n\n"); + fprintf(stderr, " -e INT Phred-scaled gap extension seq error probability [%d]\n", mplp.extQ); + fprintf(stderr, " -F FLOAT minimum fraction of gapped reads for candidates [%g]\n", mplp.min_frac); + fprintf(stderr, " -h INT coefficient for homopolymer errors [%d]\n", mplp.tandemQ); + fprintf(stderr, " -I do not perform indel calling\n"); + fprintf(stderr, " -L INT max per-sample depth for INDEL calling [%d]\n", mplp.max_indel_depth); + fprintf(stderr, " -m INT minimum gapped reads for indel candidates [%d]\n", mplp.min_support); + fprintf(stderr, " -o INT Phred-scaled gap open sequencing error probability [%d]\n", mplp.openQ); + fprintf(stderr, " -P STR comma separated list of platforms for indels [all]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Notes: Assuming diploid individuals.\n\n"); return 1; @@ -966,5 +975,6 @@ int bam_mpileup(int argc, char *argv[]) if (mplp.rghash) bcf_str2id_thorough_destroy(mplp.rghash); free(mplp.reg); free(mplp.pl_list); if (mplp.fai) fai_destroy(mplp.fai); + if (mplp.bed) bed_destroy(mplp.bed); return 0; } diff --git a/bam_sort.c b/bam_sort.c index 7aeccff..94970b5 100644 --- a/bam_sort.c +++ b/bam_sort.c @@ -70,6 +70,7 @@ static void swap_header_text(bam_header_t *h1, bam_header_t *h2) #define MERGE_RG 1 #define MERGE_UNCOMP 2 +#define MERGE_LEVEL1 4 /*! @abstract Merge multiple sorted BAM. @@ -207,11 +208,9 @@ int bam_merge_core(int by_qname, const char *out, const char *headers, int n, ch } else h->pos = HEAP_EMPTY; } - if (flag & MERGE_UNCOMP) { - fpout = strcmp(out, "-")? bam_open(out, "wu") : bam_dopen(fileno(stdout), "wu"); - } else { - fpout = strcmp(out, "-")? bam_open(out, "w") : bam_dopen(fileno(stdout), "w"); - } + if (flag & MERGE_UNCOMP) fpout = strcmp(out, "-")? bam_open(out, "wu") : bam_dopen(fileno(stdout), "wu"); + else if (flag & MERGE_LEVEL1) fpout = strcmp(out, "-")? bam_open(out, "w1") : bam_dopen(fileno(stdout), "w1"); + else fpout = strcmp(out, "-")? bam_open(out, "w") : bam_dopen(fileno(stdout), "w"); if (fpout == 0) { fprintf(stderr, "[%s] fail to create the output file.\n", __func__); return -1; @@ -257,11 +256,12 @@ int bam_merge(int argc, char *argv[]) int c, is_by_qname = 0, flag = 0, ret = 0; char *fn_headers = NULL, *reg = 0; - while ((c = getopt(argc, argv, "h:nruR:")) >= 0) { + while ((c = getopt(argc, argv, "h:nru1R:")) >= 0) { switch (c) { case 'r': flag |= MERGE_RG; break; case 'h': fn_headers = strdup(optarg); break; case 'n': is_by_qname = 1; break; + case '1': flag |= MERGE_LEVEL1; break; case 'u': flag |= MERGE_UNCOMP; break; case 'R': reg = strdup(optarg); break; } @@ -272,6 +272,7 @@ int bam_merge(int argc, char *argv[]) fprintf(stderr, "Options: -n sort by read names\n"); fprintf(stderr, " -r attach RG tag (inferred from file names)\n"); fprintf(stderr, " -u uncompressed BAM output\n"); + fprintf(stderr, " -1 compress level 1\n"); fprintf(stderr, " -R STR merge file in the specified region STR [all]\n"); fprintf(stderr, " -h FILE copy the header in FILE to [in1.bam]\n\n"); fprintf(stderr, "Note: Samtools' merge does not reconstruct the @RG dictionary in the header. Users\n"); diff --git a/bam_stat.c b/bam_stat.c index ea9deee..f2de0f1 100644 --- a/bam_stat.c +++ b/bam_stat.c @@ -3,31 +3,31 @@ #include "bam.h" typedef struct { - long long n_reads, n_mapped, n_pair_all, n_pair_map, n_pair_good; - long long n_sgltn, n_read1, n_read2; - long long n_qcfail, n_dup; - long long n_diffchr, n_diffhigh; + long long n_reads[2], n_mapped[2], n_pair_all[2], n_pair_map[2], n_pair_good[2]; + long long n_sgltn[2], n_read1[2], n_read2[2]; + long long n_dup[2]; + long long n_diffchr[2], n_diffhigh[2]; } bam_flagstat_t; #define flagstat_loop(s, c) do { \ - ++(s)->n_reads; \ + int w = ((c)->flag & BAM_FQCFAIL)? 1 : 0; \ + ++(s)->n_reads[w]; \ if ((c)->flag & BAM_FPAIRED) { \ - ++(s)->n_pair_all; \ - if ((c)->flag & BAM_FPROPER_PAIR) ++(s)->n_pair_good; \ - if ((c)->flag & BAM_FREAD1) ++(s)->n_read1; \ - if ((c)->flag & BAM_FREAD2) ++(s)->n_read2; \ - if (((c)->flag & BAM_FMUNMAP) && !((c)->flag & BAM_FUNMAP)) ++(s)->n_sgltn; \ + ++(s)->n_pair_all[w]; \ + if ((c)->flag & BAM_FPROPER_PAIR) ++(s)->n_pair_good[w]; \ + if ((c)->flag & BAM_FREAD1) ++(s)->n_read1[w]; \ + if ((c)->flag & BAM_FREAD2) ++(s)->n_read2[w]; \ + if (((c)->flag & BAM_FMUNMAP) && !((c)->flag & BAM_FUNMAP)) ++(s)->n_sgltn[w]; \ if (!((c)->flag & BAM_FUNMAP) && !((c)->flag & BAM_FMUNMAP)) { \ - ++(s)->n_pair_map; \ + ++(s)->n_pair_map[w]; \ if ((c)->mtid != (c)->tid) { \ - ++(s)->n_diffchr; \ - if ((c)->qual >= 5) ++(s)->n_diffhigh; \ + ++(s)->n_diffchr[w]; \ + if ((c)->qual >= 5) ++(s)->n_diffhigh[w]; \ } \ } \ } \ - if (!((c)->flag & BAM_FUNMAP)) ++(s)->n_mapped; \ - if ((c)->flag & BAM_FQCFAIL) ++(s)->n_qcfail; \ - if ((c)->flag & BAM_FDUP) ++(s)->n_dup; \ + if (!((c)->flag & BAM_FUNMAP)) ++(s)->n_mapped[w]; \ + if ((c)->flag & BAM_FDUP) ++(s)->n_dup[w]; \ } while (0) bam_flagstat_t *bam_flagstat_core(bamFile fp) @@ -59,18 +59,17 @@ int bam_flagstat(int argc, char *argv[]) assert(fp); header = bam_header_read(fp); s = bam_flagstat_core(fp); - printf("%lld in total\n", s->n_reads); - printf("%lld QC failure\n", s->n_qcfail); - printf("%lld duplicates\n", s->n_dup); - printf("%lld mapped (%.2f%%)\n", s->n_mapped, (float)s->n_mapped / s->n_reads * 100.0); - printf("%lld paired in sequencing\n", s->n_pair_all); - printf("%lld read1\n", s->n_read1); - printf("%lld read2\n", s->n_read2); - printf("%lld properly paired (%.2f%%)\n", s->n_pair_good, (float)s->n_pair_good / s->n_pair_all * 100.0); - printf("%lld with itself and mate mapped\n", s->n_pair_map); - printf("%lld singletons (%.2f%%)\n", s->n_sgltn, (float)s->n_sgltn / s->n_pair_all * 100.0); - printf("%lld with mate mapped to a different chr\n", s->n_diffchr); - printf("%lld with mate mapped to a different chr (mapQ>=5)\n", s->n_diffhigh); + printf("%lld + %lld in total (QC-passed reads + QC-failed reads)\n", s->n_reads[0], s->n_reads[1]); + printf("%lld + %lld duplicates\n", s->n_dup[0], s->n_dup[1]); + printf("%lld + %lld mapped (%.2f%%:%.2f%%)\n", s->n_mapped[0], s->n_mapped[1], (float)s->n_mapped[0] / s->n_reads[0] * 100.0, (float)s->n_mapped[1] / s->n_reads[1] * 100.0); + printf("%lld + %lld paired in sequencing\n", s->n_pair_all[0], s->n_pair_all[1]); + printf("%lld + %lld read1\n", s->n_read1[0], s->n_read1[1]); + printf("%lld + %lld read2\n", s->n_read2[0], s->n_read2[1]); + printf("%lld + %lld properly paired (%.2f%%:%.2f%%)\n", s->n_pair_good[0], s->n_pair_good[1], (float)s->n_pair_good[0] / s->n_pair_all[0] * 100.0, (float)s->n_pair_good[1] / s->n_pair_all[1] * 100.0); + printf("%lld + %lld with itself and mate mapped\n", s->n_pair_map[0], s->n_pair_map[1]); + printf("%lld + %lld singletons (%.2f%%:%.2f%%)\n", s->n_sgltn[0], s->n_sgltn[1], (float)s->n_sgltn[0] / s->n_pair_all[0] * 100.0, (float)s->n_sgltn[1] / s->n_pair_all[1] * 100.0); + printf("%lld + %lld with mate mapped to a different chr\n", s->n_diffchr[0], s->n_diffchr[1]); + printf("%lld + %lld with mate mapped to a different chr (mapQ>=5)\n", s->n_diffhigh[0], s->n_diffhigh[1]); free(s); bam_header_destroy(header); bam_close(fp); diff --git a/bamtk.c b/bamtk.c index 5309d5e..5886570 100644 --- a/bamtk.c +++ b/bamtk.c @@ -26,6 +26,7 @@ int main_reheader(int argc, char *argv[]); int main_cut_target(int argc, char *argv[]); int main_phase(int argc, char *argv[]); int main_cat(int argc, char *argv[]); +int main_depth(int argc, char *argv[]); int faidx_main(int argc, char *argv[]); int glf3_view_main(int argc, char *argv[]); @@ -80,6 +81,7 @@ static int usage() fprintf(stderr, " sort sort alignment file\n"); fprintf(stderr, " pileup generate pileup output\n"); fprintf(stderr, " mpileup multi-way pileup\n"); + fprintf(stderr, " depth compute the depth\n"); fprintf(stderr, " faidx index/extract FASTA\n"); #if _CURSES_LIB != 0 fprintf(stderr, " tview text alignment viewer\n"); @@ -136,6 +138,7 @@ int main(int argc, char *argv[]) else if (strcmp(argv[1], "cat") == 0) return main_cat(argc-1, argv+1); else if (strcmp(argv[1], "targetcut") == 0) return main_cut_target(argc-1, argv+1); else if (strcmp(argv[1], "phase") == 0) return main_phase(argc-1, argv+1); + else if (strcmp(argv[1], "depth") == 0) return main_depth(argc-1, argv+1); #if _CURSES_LIB != 0 else if (strcmp(argv[1], "tview") == 0) return bam_tview_main(argc-1, argv+1); #endif diff --git a/bcftools/Makefile b/bcftools/Makefile index e579a9e..b5a1b1b 100644 --- a/bcftools/Makefile +++ b/bcftools/Makefile @@ -3,7 +3,7 @@ CFLAGS= -g -Wall -O2 #-m64 #-arch ppc DFLAGS= -D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE LOBJS= bcf.o vcf.o bcfutils.o prob1.o ld.o kfunc.o index.o fet.o bcf2qcall.o OMISC= .. -AOBJS= call1.o main.o $(OMISC)/kstring.o $(OMISC)/bgzf.o $(OMISC)/knetfile.o +AOBJS= call1.o main.o $(OMISC)/kstring.o $(OMISC)/bgzf.o $(OMISC)/knetfile.o $(OMISC)/bedidx.o PROG= bcftools INCLUDES= SUBDIRS= . diff --git a/bcftools/call1.c b/bcftools/call1.c index e074fb5..d61d2c4 100644 --- a/bcftools/call1.c +++ b/bcftools/call1.c @@ -8,9 +8,6 @@ #include "kstring.h" #include "time.h" -#include "khash.h" -KHASH_SET_INIT_INT64(set64) - #include "kseq.h" KSTREAM_INIT(gzFile, gzread, 16384) @@ -31,42 +28,31 @@ KSTREAM_INIT(gzFile, gzread, 16384) typedef struct { int flag, prior_type, n1, n_sub, *sublist, n_perm; - char *fn_list, *prior_file, **subsam, *fn_dict; + char *prior_file, **subsam, *fn_dict; uint8_t *ploidy; double theta, pref, indel_frac, min_perm_p, min_smpl_frac; + void *bed; } viewconf_t; -khash_t(set64) *bcf_load_pos(const char *fn, bcf_hdr_t *_h) -{ - void *str2id; - gzFile fp; - kstream_t *ks; - int ret, dret, lineno = 1; - kstring_t *str; - khash_t(set64) *hash = 0; +void *bed_read(const char *fn); +void bed_destroy(void *_h); +int bed_overlap(const void *_h, const char *chr, int beg, int end); - hash = kh_init(set64); - str2id = bcf_build_refhash(_h); - str = calloc(1, sizeof(kstring_t)); - fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); - ks = ks_init(fp); - while (ks_getuntil(ks, 0, str, &dret) >= 0) { - int tid = bcf_str2id(str2id, str->s); - if (tid >= 0 && dret != '\n') { - if (ks_getuntil(ks, 0, str, &dret) >= 0) { - uint64_t x = (uint64_t)tid<<32 | (atoi(str->s) - 1); - kh_put(set64, hash, x, &ret); - } else break; - } else fprintf(stderr, "[%s] %s is not a reference name (line %d).\n", __func__, str->s, lineno); - if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); - if (dret < 0) break; - ++lineno; - } - bcf_str2id_destroy(str2id); - ks_destroy(ks); - gzclose(fp); - free(str->s); free(str); - return hash; +static double test_hwe(const double g[3]) +{ + extern double kf_gammaq(double p, double x); + double fexp, chi2, f[3], n; + int i; + n = g[0] + g[1] + g[2]; + fexp = (2. * g[2] + g[1]) / (2. * n); + if (fexp > 1. - 1e-10) fexp = 1. - 1e-10; + if (fexp < 1e-10) fexp = 1e-10; + f[0] = n * (1. - fexp) * (1. - fexp); + f[1] = n * 2. * fexp * (1. - fexp); + f[2] = n * fexp * fexp; + for (i = 0, chi2 = 0.; i < 3; ++i) + chi2 += (g[i] - f[i]) * (g[i] - f[i]) / f[i]; + return kf_gammaq(.5, chi2 / 2.); } typedef struct { @@ -148,13 +134,17 @@ static int update_bcf1(int n_smpl, bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p kputs(b->info, &s); if (b->info[0]) kputc(';', &s); // ksprintf(&s, "AF1=%.4lg;AFE=%.4lg;CI95=%.4lg,%.4lg", 1.-pr->f_em, 1.-pr->f_exp, pr->cil, pr->cih); - ksprintf(&s, "AF1=%.4g;CI95=%.4g,%.4g", 1.-pr->f_em, pr->cil, pr->cih); + ksprintf(&s, "AF1=%.4g;CI95=%.4g,%.4g;G3=%.4g,%.4g,%.4g", 1.-pr->f_em, pr->cil, pr->cih, pr->g[2], pr->g[1], pr->g[0]); + if (n_smpl > 5) { + double hwe = test_hwe(pr->g); + if (hwe < 0.1) ksprintf(&s, ";HWE=%.4g", hwe); + } if (has_I16) ksprintf(&s, ";DP4=%d,%d,%d,%d;MQ=%d", a.d[0], a.d[1], a.d[2], a.d[3], a.mq); fq = pr->p_ref_folded < 0.5? -4.343 * log(pr->p_ref_folded) : 4.343 * log(pr->p_var_folded); if (fq < -999) fq = -999; if (fq > 999) fq = 999; ksprintf(&s, ";FQ=%.3g", fq); - if (pr->cmp[0] >= 0.) { + if (pr->cmp[0] >= 0.) { // two sample groups int i, q[3], pq; for (i = 1; i < 3; ++i) { double x = pr->cmp[i] + pr->cmp[0]/2.; @@ -164,6 +154,7 @@ static int update_bcf1(int n_smpl, bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p pq = (int)(-4.343 * log(pr->p_chi2) + .499); if (pr->perm_rank >= 0) ksprintf(&s, ";PR=%d", pr->perm_rank); ksprintf(&s, ";QCHI2=%d;PCHI2=%.3g;PC2=%d,%d", pq, q[1], q[2], pr->p_chi2); + ksprintf(&s, ";AF2=%.4g,%.4g", 1.-pr->f_em2[0], 1.-pr->f_em2[1]); // ksprintf(&s, ",%g,%g,%g", pr->cmp[0], pr->cmp[1], pr->cmp[2]); } if (has_I16 && a.is_tested) ksprintf(&s, ";PV4=%.2g,%.2g,%.2g,%.2g", a.p[0], a.p[1], a.p[2], a.p[3]); @@ -246,6 +237,10 @@ static void write_header(bcf_hdr_t *h) kputs("##INFO=\n", &str); if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); if (!strstr(str.s, "##INFO=\n", &str); if (!strstr(str.s, "##INFO=\n", &str); if (!strstr(str.s, "##INFO=\n", &str); + kputs("##INFO=\n", &str); if (!strstr(str.s, "##FORMAT=\n", &str); if (!strstr(str.s, "##FORMAT== 0) { switch (c) { case '1': vc.n1 = atoi(optarg); break; - case 'l': vc.fn_list = strdup(optarg); break; + case 'l': vc.bed = bed_read(optarg); break; case 'D': vc.fn_dict = strdup(optarg); break; case 'F': vc.flag |= VC_FIX_PL; break; case 'N': vc.flag |= VC_ACGT_ONLY; break; @@ -338,31 +332,34 @@ int bcfview(int argc, char *argv[]) } if (argc == optind) { fprintf(stderr, "\n"); - fprintf(stderr, "Usage: bcftools view [options] [reg]\n\n"); - fprintf(stderr, "Options: -c SNP calling\n"); - fprintf(stderr, " -v output potential variant sites only (force -c)\n"); - fprintf(stderr, " -g call genotypes at variant sites (force -c)\n"); - fprintf(stderr, " -b output BCF instead of VCF\n"); - fprintf(stderr, " -u uncompressed BCF output (force -b)\n"); - fprintf(stderr, " -S input is VCF\n"); - fprintf(stderr, " -A keep all possible alternate alleles at variant sites\n"); - fprintf(stderr, " -G suppress all individual genotype information\n"); - fprintf(stderr, " -N skip sites where REF is not A/C/G/T\n"); - fprintf(stderr, " -Q output the QCALL likelihood format\n"); - fprintf(stderr, " -L calculate LD for adjacent sites\n"); - fprintf(stderr, " -I skip indels\n"); - fprintf(stderr, " -F PL generated by r921 or before\n"); - fprintf(stderr, " -d FLOAT skip loci where less than FLOAT fraction of samples covered [0]\n"); - fprintf(stderr, " -D FILE sequence dictionary for VCF->BCF conversion [null]\n"); - fprintf(stderr, " -l FILE list of sites to output [all sites]\n"); - fprintf(stderr, " -t FLOAT scaled substitution mutation rate [%.4g]\n", vc.theta); - fprintf(stderr, " -i FLOAT indel-to-substitution ratio [%.4g]\n", vc.indel_frac); - fprintf(stderr, " -p FLOAT variant if P(ref|D) [reg]\n\n"); + fprintf(stderr, "Input/output options:\n\n"); + fprintf(stderr, " -A keep all possible alternate alleles at variant sites\n"); + fprintf(stderr, " -b output BCF instead of VCF\n"); + fprintf(stderr, " -D FILE sequence dictionary for VCF->BCF conversion [null]\n"); + fprintf(stderr, " -F PL generated by r921 or before (which generate old ordering)\n"); + fprintf(stderr, " -G suppress all individual genotype information\n"); + fprintf(stderr, " -l FILE list of sites (chr pos) or regions (BED) to output [all sites]\n"); + fprintf(stderr, " -L calculate LD for adjacent sites\n"); + fprintf(stderr, " -N skip sites where REF is not A/C/G/T\n"); + fprintf(stderr, " -Q output the QCALL likelihood format\n"); + fprintf(stderr, " -s FILE list of samples to use [all samples]\n"); + fprintf(stderr, " -S input is VCF\n"); + fprintf(stderr, " -u uncompressed BCF output (force -b)\n"); + fprintf(stderr, "\nConsensus/variant calling options:\n\n"); + fprintf(stderr, " -c SNP calling\n"); + fprintf(stderr, " -d FLOAT skip loci where less than FLOAT fraction of samples covered [0]\n"); + fprintf(stderr, " -g call genotypes at variant sites (force -c)\n"); + fprintf(stderr, " -i FLOAT indel-to-substitution ratio [%.4g]\n", vc.indel_frac); + fprintf(stderr, " -I skip indels\n"); + fprintf(stderr, " -p FLOAT variant if P(ref|D) 0.) bcf_p1_indel_prior(p1, vc.indel_frac); // otherwise use the default indel_frac } - if (vc.fn_list) hash = bcf_load_pos(vc.fn_list, hin); if (optind + 1 < argc && !(vc.flag&VC_VCFIN)) { void *str2id = bcf_build_refhash(hout); if (bcf_parse_region(str2id, argv[optind+1], &tid, &begin, &end) >= 0) { @@ -447,13 +443,7 @@ int bcfview(int argc, char *argv[]) x = toupper(b->ref[0]); if (x != 'A' && x != 'C' && x != 'G' && x != 'T') continue; } - if (hash) { - uint64_t x = (uint64_t)b->tid<<32 | b->pos; - khint_t k = kh_get(set64, hash, x); - if (kh_size(hash) == 0) break; - if (k == kh_end(hash)) continue; - kh_del(set64, hash, k); - } + if (vc.bed && !bed_overlap(vc.bed, hin->ns[b->tid], b->pos, b->pos + strlen(b->ref))) continue; if (tid >= 0) { int l = strlen(b->ref); l = b->pos + (l > 0? l : 1); @@ -492,7 +482,7 @@ int bcfview(int argc, char *argv[]) kstring_t s; s.m = s.l = 0; s.s = 0; if (*b->info) kputc(';', &s); - ksprintf(&s, "NEIR=%.3f;NEIF=%.3f,%.3f", r2, f[0]+f[2], f[0]+f[1]); + ksprintf(&s, "NEIR=%.3f;NEIF4=%.3f,%.3f,%.3f,%.3f", r2, f[0], f[1], f[2], f[3]); bcf_append_info(b, s.s, s.l); free(s.s); } @@ -512,8 +502,6 @@ int bcfview(int argc, char *argv[]) bcf_hdr_destroy(hin); bcf_destroy(b); bcf_destroy(blast); vcf_close(bp); vcf_close(bout); - if (hash) kh_destroy(set64, hash); - if (vc.fn_list) free(vc.fn_list); if (vc.fn_dict) free(vc.fn_dict); if (vc.ploidy) free(vc.ploidy); if (vc.n_sub) { @@ -521,6 +509,7 @@ int bcfview(int argc, char *argv[]) for (i = 0; i < vc.n_sub; ++i) free(vc.subsam[i]); free(vc.subsam); free(vc.sublist); } + if (vc.bed) bed_destroy(vc.bed); if (seeds) free(seeds); if (p1) bcf_p1_destroy(p1); return 0; diff --git a/bcftools/ld.c b/bcftools/ld.c index 11474ed..0207819 100644 --- a/bcftools/ld.c +++ b/bcftools/ld.c @@ -98,3 +98,46 @@ double bcf_ld_freq(const bcf1_t *b0, const bcf1_t *b1, double f[4]) } return r; } + +int bcf_main_pwld(int argc, char *argv[]) +{ + bcf_t *fp; + bcf_hdr_t *h; + bcf1_t **b, *b0; + int i, j, m, n; + double f[4]; + if (argc == 1) { + fprintf(stderr, "Usage: bcftools pwld \n"); + return 1; + } + fp = bcf_open(argv[1], "rb"); + h = bcf_hdr_read(fp); + // read the entire BCF + m = n = 0; b = 0; + b0 = calloc(1, sizeof(bcf1_t)); + while (bcf_read(fp, h, b0) >= 0) { + if (m == n) { + m = m? m<<1 : 16; + b = realloc(b, sizeof(void*) * m); + } + b[n] = calloc(1, sizeof(bcf1_t)); + bcf_cpy(b[n++], b0); + } + bcf_destroy(b0); + // compute pair-wise r^2 + printf("%d\n", n); // the number of loci + for (i = 0; i < n; ++i) { + printf("%s:%d", h->ns[b[i]->tid], b[i]->pos + 1); + for (j = 0; j < i; ++j) { + double r = bcf_ld_freq(b[i], b[j], f); + printf("\t%.3f", r*r); + } + printf("\t1.000\n"); + } + // free + for (i = 0; i < n; ++i) bcf_destroy(b[i]); + free(b); + bcf_hdr_destroy(h); + bcf_close(fp); + return 0; +} diff --git a/bcftools/main.c b/bcftools/main.c index 7ffc2a0..75125f8 100644 --- a/bcftools/main.c +++ b/bcftools/main.c @@ -5,6 +5,7 @@ int bcfview(int argc, char *argv[]); int bcf_main_index(int argc, char *argv[]); +int bcf_main_pwld(int argc, char *argv[]); #define BUF_SIZE 0x10000 @@ -50,12 +51,14 @@ int main(int argc, char *argv[]) fprintf(stderr, "Command: view print, extract, convert and call SNPs from BCF\n"); fprintf(stderr, " index index BCF\n"); fprintf(stderr, " cat concatenate BCFs\n"); + fprintf(stderr, " ld compute all-pair r^2\n"); fprintf(stderr, "\n"); return 1; } if (strcmp(argv[1], "view") == 0) return bcfview(argc-1, argv+1); else if (strcmp(argv[1], "index") == 0) return bcf_main_index(argc-1, argv+1); - else if (strcmp(argv[1], "cat") == 0) return bcf_cat(argc-2, argv+2); + else if (strcmp(argv[1], "ld") == 0) return bcf_main_pwld(argc-1, argv+1); + else if (strcmp(argv[1], "cat") == 0) return bcf_cat(argc-2, argv+2); // cat is different ... else { fprintf(stderr, "[main] Unrecognized command.\n"); return 1; diff --git a/bcftools/prob1.c b/bcftools/prob1.c index 503a998..a024d04 100644 --- a/bcftools/prob1.c +++ b/bcftools/prob1.c @@ -10,7 +10,7 @@ KSTREAM_INIT(gzFile, gzread, 16384) #define MC_MAX_EM_ITER 16 -#define MC_EM_EPS 1e-4 +#define MC_EM_EPS 1e-5 #define MC_DEF_INDEL 0.15 unsigned char seq_nt4_table[256] = { @@ -209,21 +209,39 @@ static int cal_pdg(const bcf1_t *b, bcf_p1aux_t *ma) return i; } // f0 is the reference allele frequency -static double mc_freq_iter(double f0, const bcf_p1aux_t *ma) +static double mc_freq_iter(double f0, const bcf_p1aux_t *ma, int beg, int end) { double f, f3[3]; int i; f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0; - for (i = 0, f = 0.; i < ma->n; ++i) { + for (i = beg, f = 0.; i < end; ++i) { double *pdg; pdg = ma->pdg + i * 3; f += (pdg[1] * f3[1] + 2. * pdg[2] * f3[2]) / (pdg[0] * f3[0] + pdg[1] * f3[1] + pdg[2] * f3[2]); } - f /= ma->n * 2.; + f /= (end - beg) * 2.; return f; } +static double mc_gtfreq_iter(double g[3], const bcf_p1aux_t *ma, int beg, int end) +{ + double err, gg[3]; + int i; + gg[0] = gg[1] = gg[2] = 0.; + for (i = beg; i < end; ++i) { + double *pdg, sum, tmp[3]; + pdg = ma->pdg + i * 3; + tmp[0] = pdg[0] * g[0]; tmp[1] = pdg[1] * g[1]; tmp[2] = pdg[2] * g[2]; + sum = (tmp[0] + tmp[1] + tmp[2]) * (end - beg); + gg[0] += tmp[0] / sum; gg[1] += tmp[1] / sum; gg[2] += tmp[2] / sum; + } + err = fabs(gg[0] - g[0]) > fabs(gg[1] - g[1])? fabs(gg[0] - g[0]) : fabs(gg[1] - g[1]); + err = err > fabs(gg[2] - g[2])? err : fabs(gg[2] - g[2]); + g[0] = gg[0]; g[1] = gg[1]; g[2] = gg[2]; + return err; +} + int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k) { double sum, g[3]; @@ -448,6 +466,8 @@ static double contrast2(bcf_p1aux_t *p1, double ret[3]) for (k1 = 0, z = 0.; k1 <= 2*n1; ++k1) for (k2 = 0; k2 <= 2*n2; ++k2) if ((y = contrast2_aux(p1, sum, n1, n2, k1, k2, ret)) >= 0) z += y; + if (ret[0] + ret[1] + ret[2] < 0.99) // It seems that this may be caused by floating point errors. I do not really understand why... + z = 1.0, ret[0] = ret[1] = ret[2] = 1./3; } return (double)z; } @@ -516,10 +536,27 @@ int bcf_p1_cal(const bcf1_t *b, bcf_p1aux_t *ma, bcf_p1rst_t *rst) { // calculate f_em double flast = rst->f_flat; for (i = 0; i < MC_MAX_EM_ITER; ++i) { - rst->f_em = mc_freq_iter(flast, ma); + rst->f_em = mc_freq_iter(flast, ma, 0, ma->n); if (fabs(rst->f_em - flast) < MC_EM_EPS) break; flast = rst->f_em; } + if (ma->n1 > 0 && ma->n1 < ma->n) { + for (k = 0; k < 2; ++k) { + flast = rst->f_em; + for (i = 0; i < MC_MAX_EM_ITER; ++i) { + rst->f_em2[k] = k? mc_freq_iter(flast, ma, ma->n1, ma->n) : mc_freq_iter(flast, ma, 0, ma->n1); + if (fabs(rst->f_em2[k] - flast) < MC_EM_EPS) break; + flast = rst->f_em2[k]; + } + } + } + } + { // compute g[3] + rst->g[0] = (1. - rst->f_em) * (1. - rst->f_em); + rst->g[1] = 2. * rst->f_em * (1. - rst->f_em); + rst->g[2] = rst->f_em * rst->f_em; + for (i = 0; i < MC_MAX_EM_ITER; ++i) + if (mc_gtfreq_iter(rst->g, ma, 0, ma->n) < MC_EM_EPS) break; } { // estimate equal-tail credible interval (95% level) int l, h; @@ -534,7 +571,6 @@ int bcf_p1_cal(const bcf1_t *b, bcf_p1aux_t *ma, bcf_p1rst_t *rst) h = i; rst->cil = (double)(ma->M - h) / ma->M; rst->cih = (double)(ma->M - l) / ma->M; } - rst->g[0] = rst->g[1] = rst->g[2] = -1.; rst->cmp[0] = rst->cmp[1] = rst->cmp[2] = rst->p_chi2 = -1.0; if (rst->p_var > 0.1) // skip contrast2() if the locus is a strong non-variant rst->p_chi2 = contrast2(ma, rst->cmp); diff --git a/bcftools/prob1.h b/bcftools/prob1.h index 3f89dda..b824c3c 100644 --- a/bcftools/prob1.h +++ b/bcftools/prob1.h @@ -10,7 +10,7 @@ typedef struct { int rank0, perm_rank; // NB: perm_rank is always set to -1 by bcf_p1_cal() double f_em, f_exp, f_flat, p_ref_folded, p_ref, p_var_folded, p_var; double cil, cih; - double cmp[3], p_chi2; // used by contrast2() + double cmp[3], p_chi2, f_em2[2]; // used by contrast2() double g[3]; } bcf_p1rst_t; diff --git a/bedidx.c b/bedidx.c new file mode 100644 index 0000000..9297831 --- /dev/null +++ b/bedidx.c @@ -0,0 +1,152 @@ +#include +#include +#include +#include +#include + +#include "kseq.h" +KSTREAM_INIT(gzFile, gzread, 8192) + +typedef struct { + int n, m; + uint64_t *a; + int *idx; +} bed_reglist_t; + +#include "khash.h" +KHASH_MAP_INIT_STR(reg, bed_reglist_t) + +#define LIDX_SHIFT 13 + +typedef kh_reg_t reghash_t; + +int *bed_index_core(int n, uint64_t *a, int *n_idx) +{ + int i, j, m, *idx; + m = *n_idx = 0; idx = 0; + for (i = 0; i < n; ++i) { + int beg, end; + beg = a[i]>>32 >> LIDX_SHIFT; end = ((uint32_t)a[i]) >> LIDX_SHIFT; + if (m < end + 1) { + int oldm = m; + m = end + 1; + kroundup32(m); + idx = realloc(idx, m * sizeof(int)); + for (j = oldm; j < m; ++j) idx[j] = -1; + } + if (beg == end) { + if (idx[beg] < 0) idx[beg] = i; + } else { + for (j = beg; j <= end; ++j) + if (idx[j] < 0) idx[j] = i; + } + *n_idx = end + 1; + } + return idx; +} + +void bed_index(void *_h) +{ + reghash_t *h = (reghash_t*)_h; + khint_t k; + for (k = 0; k < kh_end(h); ++k) { + if (kh_exist(h, k)) { + bed_reglist_t *p = &kh_val(h, k); + if (p->idx) free(p->idx); + p->idx = bed_index_core(p->n, p->a, &p->m); + } + } +} + +int bed_overlap_core(const bed_reglist_t *p, int beg, int end) +{ + int i, min_off; + if (p->n == 0) return 0; + min_off = (beg>>LIDX_SHIFT >= p->n)? p->idx[p->n-1] : p->idx[beg>>LIDX_SHIFT]; + if (min_off < 0) { // TODO: this block can be improved, but speed should not matter too much here + int n = beg>>LIDX_SHIFT; + if (n > p->n) n = p->n; + for (i = n - 1; i >= 0; --i) + if (p->idx[i] >= 0) break; + min_off = i >= 0? p->idx[i] : 0; + } + for (i = min_off; i < p->n; ++i) { + if ((int)(p->a[i]>>32) >= end) break; // out of range; no need to proceed + if ((int32_t)p->a[i] > beg && (int32_t)(p->a[i]>>32) < end) + return 1; // find the overlap; return + } + return 0; +} + +int bed_overlap(const void *_h, const char *chr, int beg, int end) +{ + const reghash_t *h = (const reghash_t*)_h; + khint_t k; + if (!h) return 0; + k = kh_get(reg, h, chr); + if (k == kh_end(h)) return 0; + return bed_overlap_core(&kh_val(h, k), beg, end); +} + +void *bed_read(const char *fn) +{ + reghash_t *h = kh_init(reg); + gzFile fp; + kstream_t *ks; + int dret; + kstring_t *str; + // read the list + fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); + if (fp == 0) return 0; + str = calloc(1, sizeof(kstring_t)); + ks = ks_init(fp); + while (ks_getuntil(ks, 0, str, &dret) >= 0) { // read the chr name + int beg = -1, end = -1; + bed_reglist_t *p; + khint_t k = kh_get(reg, h, str->s); + if (k == kh_end(h)) { // absent from the hash table + int ret; + char *s = strdup(str->s); + k = kh_put(reg, h, s, &ret); + memset(&kh_val(h, k), 0, sizeof(bed_reglist_t)); + } + p = &kh_val(h, k); + if (dret != '\n') { // if the lines has other characters + if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) { + beg = atoi(str->s); // begin + if (dret != '\n') { + if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) + end = atoi(str->s); // end + } + } + } + if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); // skip the rest of the line + if (end < 0 && beg > 0) end = beg, beg = beg - 1; // if there is only one column + if (beg >= 0 && end > beg) { + if (p->n == p->m) { + p->m = p->m? p->m<<1 : 4; + p->a = realloc(p->a, p->m * 8); + } + p->a[p->n++] = (uint64_t)beg<<32 | end; + } + } + ks_destroy(ks); + gzclose(fp); + free(str->s); free(str); + bed_index(h); + return h; +} + +void bed_destroy(void *_h) +{ + reghash_t *h = (reghash_t*)_h; + khint_t k; + for (k = 0; k < kh_end(h); ++k) { + if (kh_exist(h, k)) { + free(kh_val(h, k).a); + free(kh_val(h, k).idx); + free((char*)kh_key(h, k)); + } + } + kh_destroy(reg, h); +} diff --git a/bgzf.c b/bgzf.c index db970c8..216cd04 100644 --- a/bgzf.c +++ b/bgzf.c @@ -111,6 +111,32 @@ report_error(BGZF* fp, const char* message) { fp->error = message; } +int bgzf_check_bgzf(const char *fn) +{ + BGZF *fp; + uint8_t buf[10],magic[10]="\037\213\010\4\0\0\0\0\0\377"; + int n; + + if ((fp = bgzf_open(fn, "r")) == 0) + { + fprintf(stderr, "[bgzf_check_bgzf] failed to open the file: %s\n",fn); + return -1; + } + +#ifdef _USE_KNETFILE + n = knet_read(fp->x.fpr, buf, 10); +#else + n = fread(buf, 1, 10, fp->file); +#endif + bgzf_close(fp); + + if ( n!=10 ) + return -1; + + if ( !memcmp(magic, buf, 10) ) return 1; + return 0; +} + static BGZF *bgzf_read_init() { BGZF *fp; diff --git a/bgzf.h b/bgzf.h index 9dc7b5f..7295f37 100644 --- a/bgzf.h +++ b/bgzf.h @@ -128,6 +128,7 @@ int bgzf_check_EOF(BGZF *fp); int bgzf_read_block(BGZF* fp); int bgzf_flush(BGZF* fp); int bgzf_flush_try(BGZF *fp, int size); +int bgzf_check_bgzf(const char *fn); #ifdef __cplusplus } diff --git a/sam.c b/sam.c index e195b0f..f026bc8 100644 --- a/sam.c +++ b/sam.c @@ -59,9 +59,9 @@ samfile_t *samopen(const char *fn, const char *mode, const void *aux) append_header_text(fp->header, textheader->text, textheader->l_text); bam_header_destroy(textheader); } - if (fp->header->n_targets == 0) + if (fp->header->n_targets == 0 && bam_verbose >= 1) fprintf(stderr, "[samopen] no @SQ lines in the header.\n"); - } else fprintf(stderr, "[samopen] SAM header is present: %d sequences.\n", fp->header->n_targets); + } else if (bam_verbose >= 2) fprintf(stderr, "[samopen] SAM header is present: %d sequences.\n", fp->header->n_targets); } } else if (strchr(mode, 'w')) { // write fp->header = bam_header_dup((const bam_header_t*)aux); @@ -93,10 +93,10 @@ samfile_t *samopen(const char *fn, const char *mode, const void *aux) sam_header_parse(alt); alt->l_text = 0; alt->text = 0; // check if there are @SQ lines in the header - fwrite(fp->header->text, 1, fp->header->l_text, fp->x.tamw); + fwrite(fp->header->text, 1, fp->header->l_text, fp->x.tamw); // FIXME: better to skip the trailing NULL if (alt->n_targets) { // then write the header text without dumping ->target_{name,len} - if (alt->n_targets != fp->header->n_targets) - fprintf(stderr, "[samopen] inconsistent number of target sequences.\n"); + if (alt->n_targets != fp->header->n_targets && bam_verbose >= 1) + fprintf(stderr, "[samopen] inconsistent number of target sequences. Output the text header.\n"); } else { // then dump ->target_{name,len} for (i = 0; i < fp->header->n_targets; ++i) fprintf(fp->x.tamw, "@SQ\tSN:%s\tLN:%d\n", fp->header->target_name[i], fp->header->target_len[i]); @@ -168,7 +168,7 @@ char *samfaipath(const char *fn_ref) if (access(fn_ref, R_OK) == -1) { fprintf(stderr, "[samfaipath] fail to read file %s.\n", fn_ref); } else { - fprintf(stderr, "[samfaipath] build FASTA index...\n"); + if (bam_verbose >= 3) fprintf(stderr, "[samfaipath] build FASTA index...\n"); if (fai_build(fn_ref) == -1) { fprintf(stderr, "[samfaipath] fail to build FASTA index.\n"); free(fn_list); fn_list = 0; diff --git a/sam_header.c b/sam_header.c index 05d75de..f49b2d4 100644 --- a/sam_header.c +++ b/sam_header.c @@ -563,6 +563,7 @@ void *sam_header_parse2(const char *headerText) const char *text; char *buf=NULL; size_t nbuf = 0; + int tovalidate = 0; if ( !headerText ) return 0; @@ -571,7 +572,7 @@ void *sam_header_parse2(const char *headerText) while ( (text=nextline(&buf, &nbuf, text)) ) { hline = sam_header_line_parse(buf); - if ( hline && sam_header_line_validate(hline) ) + if ( hline && (!tovalidate || sam_header_line_validate(hline)) ) // With too many (~250,000) reference sequences the header parsing was too slow with list_append. hlines = list_append_to_end(hlines, hline); else diff --git a/sam_view.c b/sam_view.c index 170bd84..a579614 100644 --- a/sam_view.c +++ b/sam_view.c @@ -6,6 +6,7 @@ #include "sam_header.h" #include "sam.h" #include "faidx.h" +#include "kstring.h" #include "khash.h" KHASH_SET_INIT_STR(rg) @@ -18,10 +19,15 @@ typedef struct { typedef khash_t(rg) *rghash_t; -rghash_t g_rghash = 0; +static rghash_t g_rghash = 0; static int g_min_mapQ = 0, g_flag_on = 0, g_flag_off = 0; static char *g_library, *g_rg; static int g_sol2sanger_tbl[128]; +static void *g_bed; + +void *bed_read(const char *fn); +void bed_destroy(void *_h); +int bed_overlap(const void *_h, const char *chr, int beg, int end); static void sol2sanger(bam1_t *b) { @@ -44,6 +50,8 @@ static inline int __g_skip_aln(const bam_header_t *h, const bam1_t *b) { if (b->core.qual < g_min_mapQ || ((b->core.flag & g_flag_on) != g_flag_on) || (b->core.flag & g_flag_off)) return 1; + if (g_bed && b->core.tid >= 0 && !bed_overlap(g_bed, h->target_name[b->core.tid], b->core.pos, bam_calend(&b->core, bam1_cigar(b)))) + return 1; if (g_rg || g_rghash) { uint8_t *s = bam_aux_get(b, "RG"); if (s) { @@ -61,6 +69,37 @@ static inline int __g_skip_aln(const bam_header_t *h, const bam1_t *b) return 0; } +static char *drop_rg(char *hdtxt, rghash_t h, int *len) +{ + char *p = hdtxt, *q, *r, *s; + kstring_t str; + memset(&str, 0, sizeof(kstring_t)); + while (1) { + int toprint = 0; + q = strchr(p, '\n'); + if (q == 0) q = p + strlen(p); + if (q - p < 3) break; // the line is too short; then stop + if (strncmp(p, "@RG\t", 4) == 0) { + int c; + khint_t k; + if ((r = strstr(p, "\tID:")) != 0) { + r += 4; + for (s = r; *s != '\0' && *s != '\n' && *s != '\t'; ++s); + c = *s; *s = '\0'; + k = kh_get(rg, h, r); + *s = c; + if (k != kh_end(h)) toprint = 1; + } + } else toprint = 1; + if (toprint) { + kputsn(p, q - p, &str); kputc('\n', &str); + } + p = q + 1; + } + *len = str.l; + return str.s; +} + // callback function for bam_fetch() that prints nonskipped records static int view_func(const bam1_t *b, void *data) { @@ -90,7 +129,7 @@ int main_samview(int argc, char *argv[]) /* parse command-line options */ strcpy(in_mode, "r"); strcpy(out_mode, "w"); - while ((c = getopt(argc, argv, "Sbct:h1Ho:q:f:F:ul:r:xX?T:CR:")) >= 0) { + while ((c = getopt(argc, argv, "Sbct:h1Ho:q:f:F:ul:r:xX?T:CR:L:")) >= 0) { switch (c) { case 'c': is_count = 1; break; case 'C': slx2sngr = 1; break; @@ -106,6 +145,7 @@ int main_samview(int argc, char *argv[]) case 'u': compress_level = 0; break; case '1': compress_level = 1; break; case 'l': g_library = strdup(optarg); break; + case 'L': g_bed = bed_read(optarg); break; case 'r': g_rg = strdup(optarg); break; case 'R': fn_rg = strdup(optarg); break; case 'x': of_type = BAM_OFHEX; break; @@ -156,6 +196,14 @@ int main_samview(int argc, char *argv[]) ret = 1; goto view_end; } + if (g_rghash) { // FIXME: I do not know what "bam_header_t::n_text" is for... + char *tmp; + int l; + tmp = drop_rg(in->header->text, g_rghash, &l); + free(in->header->text); + in->header->text = tmp; + in->header->l_text = l; + } if (!is_count && (out = samopen(fn_out? fn_out : "-", out_mode, in->header)) == 0) { fprintf(stderr, "[main_samview] fail to open \"%s\" for writing.\n", fn_out? fn_out : "standard output"); ret = 1; @@ -215,6 +263,7 @@ view_end: } // close files, free and return free(fn_list); free(fn_ref); free(fn_out); free(g_library); free(g_rg); free(fn_rg); + if (g_bed) bed_destroy(g_bed); if (g_rghash) { khint_t k; for (k = 0; k < kh_end(g_rghash); ++k) @@ -240,6 +289,7 @@ static int usage(int is_long_help) fprintf(stderr, " -x output FLAG in HEX (samtools-C specific)\n"); fprintf(stderr, " -X output FLAG in string (samtools-C specific)\n"); fprintf(stderr, " -c print only the count of matching records\n"); + fprintf(stderr, " -L FILE output alignments overlapping the input BED FILE [null]\n"); fprintf(stderr, " -t FILE list of reference names and lengths (force -S) [null]\n"); fprintf(stderr, " -T FILE reference sequence file (force -S) [null]\n"); fprintf(stderr, " -o FILE output file name [stdout]\n"); diff --git a/sample.c b/sample.c index b3d2642..3efc559 100644 --- a/sample.c +++ b/sample.c @@ -75,6 +75,7 @@ int bam_smpl_add(bam_sample_t *sm, const char *fn, const char *txt) ++n; } if (n == 0) add_pair(sm, sm2id, fn, fn); +// add_pair(sm, sm2id, fn, fn); free(buf.s); return 0; } -- 2.30.2