From fa217aa47313e2535cbd2d4bb034cfd405162662 Mon Sep 17 00:00:00 2001 From: Charles Plessy Date: Tue, 29 Mar 2011 12:52:27 +0900 Subject: [PATCH] Imported Upstream version 0.1.14 --- Makefile | 2 +- NEWS | 27 +- bam.h | 2 +- bam_cat.c | 184 ++++++++ bam_plcmd.c | 27 +- bam_sort.c | 13 +- bamtk.c | 3 + bcftools/bcf.h | 4 + bcftools/bcf.tex | 13 +- bcftools/bcftools.1 | 140 ++++-- bcftools/bcfutils.c | 96 ++++- bcftools/call1.c | 132 ++++-- bcftools/prob1.c | 284 ++++++++---- bcftools/prob1.h | 7 +- bgzf.c | 51 ++- bgzf.h | 3 +- misc/export2sam.pl | 1006 +++++++++++++++++++++++-------------------- sam.c | 20 +- sam_view.c | 16 +- samtools.1 | 36 +- 20 files changed, 1377 insertions(+), 689 deletions(-) create mode 100644 bam_cat.c diff --git a/Makefile b/Makefile index af93d9c..f025942 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ DFLAGS= -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE -D_USE_KNETFILE -D_CURSES_ KNETFILE_O= knetfile.o LOBJS= bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \ bam_pileup.o bam_lpileup.o bam_md.o glf.o razf.o faidx.o \ - $(KNETFILE_O) bam_sort.o sam_header.o bam_reheader.o kprobaln.o + $(KNETFILE_O) bam_sort.o sam_header.o bam_reheader.o kprobaln.o bam_cat.o AOBJS= bam_tview.o bam_maqcns.o bam_plcmd.o sam_view.o \ bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \ bamtk.o kaln.o bam2bcf.o bam2bcf_indel.o errmod.o sample.o \ diff --git a/NEWS b/NEWS index 8455b48..946ba7b 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,28 @@ +Beta release 0.1.14 (16 March, 2011) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This release implements a method for testing associations for case-control +data. The method does not call genotypes but instead sums over all genotype +configurations to compute a chi^2 based test statistics. It can be potentially +applied to comparing a pair of samples (e.g. a tumor-normal pair), but this +has not been evaluated on real data. + +Another new feature is to make X chromosome variant calls when female and male +samples are both present. The user needs to provide a file indicating the +ploidy of each sample. + +Other new features: + + * Added `samtools mpileup -L' to skip INDEL calling in regions with + excessively high coverage. Such regions dramatically slow down mpileup. + + * Added `bcftools view -F' to parse BCF files generated by samtools r921 or + older which encode PL in a different way. + +(0.1.14: 16 March 2011, r930:163) + + + Beta release 0.1.13 (1 March, 2011) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -56,7 +81,7 @@ Other notable changes in bcftools: * Updated the BCF spec. * Added the `FQ' VCF INFO field, which gives the phred-scaled probability - of all samples being the samei (identical to the reference or all homozygous + of all samples being the same (identical to the reference or all homozygous variants). Option `view -f' has been dropped. * Implementated of "vcfutils.pl vcf2fq" to generate a consensus sequence diff --git a/bam.h b/bam.h index 4ad2644..bc8e3f1 100644 --- a/bam.h +++ b/bam.h @@ -40,7 +40,7 @@ @copyright Genome Research Ltd. */ -#define BAM_VERSION "0.1.13 (r926:134)" +#define BAM_VERSION "0.1.14 (r933:170)" #include #include diff --git a/bam_cat.c b/bam_cat.c new file mode 100644 index 0000000..0fde045 --- /dev/null +++ b/bam_cat.c @@ -0,0 +1,184 @@ +/* + +bam_cat -- efficiently concatenates bam files + +bam_cat can be used to concatenate BAM files. Under special +circumstances, it can be used as an alternative to 'samtools merge' to +concatenate multiple sorted files into a single sorted file. For this +to work each file must be sorted, and the sorted files must be given +as command line arguments in order such that the final read in file i +is less than or equal to the first read in file i+1. + +This code is derived from the bam_reheader function in samtools 0.1.8 +and modified to perform concatenation by Chris Saunders on behalf of +Illumina. + + +########## License: + +The MIT License + +Original SAMtools work copyright (c) 2008-2009 Genome Research Ltd. +Modified SAMtools work copyright (c) 2010 Illumina, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. + +*/ + + +/* +makefile: +""" +CC=gcc +CFLAGS+=-g -Wall -O2 -D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE -I$(SAMTOOLS_DIR) +LDFLAGS+=-L$(SAMTOOLS_DIR) +LDLIBS+=-lbam -lz + +all:bam_cat +""" +*/ + + +#include +#include +#include + +#include "bgzf.h" +#include "bam.h" + +#define BUF_SIZE 0x10000 + +#define GZIPID1 31 +#define GZIPID2 139 + +#define BGZF_EMPTY_BLOCK_SIZE 28 + + +int bam_cat(int nfn, char * const *fn, const bam_header_t *h, const char* outbam) +{ + BGZF *fp; + FILE* fp_file; + uint8_t *buf; + uint8_t ebuf[BGZF_EMPTY_BLOCK_SIZE]; + const int es=BGZF_EMPTY_BLOCK_SIZE; + int i; + + fp = strcmp(outbam, "-")? bgzf_open(outbam, "w") : bgzf_fdopen(fileno(stdout), "w"); + if (fp == 0) { + fprintf(stderr, "[%s] ERROR: fail to open output file '%s'.\n", __func__, outbam); + return 1; + } + if (h) bam_header_write(fp, h); + + buf = (uint8_t*) malloc(BUF_SIZE); + for(i = 0; i < nfn; ++i){ + BGZF *in; + bam_header_t *old; + int len,j; + + in = strcmp(fn[i], "-")? bam_open(fn[i], "r") : bam_dopen(fileno(stdin), "r"); + if (in == 0) { + fprintf(stderr, "[%s] ERROR: fail to open file '%s'.\n", __func__, fn[i]); + return -1; + } + if (in->open_mode != 'r') return -1; + + old = bam_header_read(in); + if (h == 0 && i == 0) bam_header_write(fp, old); + + if (in->block_offset < in->block_length) { + bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset); + bgzf_flush(fp); + } + + j=0; +#ifdef _USE_KNETFILE + fp_file=fp->x.fpw; + while ((len = knet_read(in->x.fpr, buf, BUF_SIZE)) > 0) { +#else + fp_file=fp->file; + while (!feof(in->file) && (len = fread(buf, 1, BUF_SIZE, in->file)) > 0) { +#endif + if(len= 0) { + switch (c) { + case 'h': { + tamFile fph = sam_open(optarg); + if (fph == 0) { + fprintf(stderr, "[%s] ERROR: fail to read the header from '%s'.\n", __func__, argv[1]); + return 1; + } + h = sam_header_read(fph); + sam_close(fph); + break; + } + case 'o': outfn = strdup(optarg); break; + } + } + if (argc - optind < 2) { + fprintf(stderr, "Usage: samtools cat [-h header.sam] [-o out.bam] [...]\n"); + return 1; + } + ret = bam_cat(argc - optind, argv + optind, h, outfn? outfn : "-"); + free(outfn); + return ret; +} diff --git a/bam_plcmd.c b/bam_plcmd.c index bba62cb..fd75cec 100644 --- a/bam_plcmd.c +++ b/bam_plcmd.c @@ -534,9 +534,10 @@ int bam_pileup(int argc, char *argv[]) #define MPLP_FMT_SP 0x200 #define MPLP_NO_INDEL 0x400 #define MPLP_EXT_BAQ 0x800 +#define MPLP_ILLUMINA13 0x1000 typedef struct { - int max_mq, min_mq, flag, min_baseQ, capQ_thres, max_depth; + int max_mq, min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth; int openQ, extQ, tandemQ, min_support; // for indels double min_frac; // for indels char *reg, *fn_pos, *pl_list; @@ -575,6 +576,12 @@ static int mplp_func(void *data, bam1_t *b) skip = (rg && bcf_str2id(ma->conf->rghash, (const char*)(rg+1)) >= 0); if (skip) continue; } + if (ma->conf->flag & MPLP_ILLUMINA13) { + int i; + uint8_t *qual = bam1_qual(b); + for (i = 0; i < b->core.l_qseq; ++i) + qual[i] = qual[i] > 31? qual[i] - 31 : 0; + } has_ref = (ma->ref && ma->ref_id == b->core.tid)? 1 : 0; skip = 0; if (has_ref && (ma->conf->flag&MPLP_REALN)) bam_prob_realn_core(b, ma->ref, (ma->conf->flag & MPLP_EXT_BAQ)? 3 : 1); @@ -617,7 +624,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list); extern void bcf_call_del_rghash(void *rghash); mplp_aux_t **data; - int i, tid, pos, *n_plp, beg0 = 0, end0 = 1u<<29, ref_len, ref_tid, max_depth; + int i, tid, pos, *n_plp, beg0 = 0, end0 = 1u<<29, ref_len, ref_tid, max_depth, max_indel_depth; const bam_pileup1_t **plp; bam_mplp_t iter; bam_header_t *h = 0; @@ -722,6 +729,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) max_depth = 8000 / sm->n; fprintf(stderr, "<%s> Set max per-sample depth to %d\n", __func__, max_depth); } + max_indel_depth = conf->max_indel_depth * sm->n; bam_mplp_set_maxcnt(iter, max_depth); while (bam_mplp_auto(iter, &tid, &pos, n_plp, plp) > 0) { if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested @@ -737,8 +745,9 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) ref_tid = tid; } if (conf->flag & MPLP_GLF) { - int _ref0, ref16; + int total_depth, _ref0, ref16; bcf1_t *b = calloc(1, sizeof(bcf1_t)); + for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i]; group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp); _ref0 = (ref && pos < ref_len)? ref[pos] : 'N'; ref16 = bam_nt16_table[_ref0]; @@ -750,7 +759,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) bcf_write(bp, bh, b); bcf_destroy(b); // call indels - if (!(conf->flag&MPLP_NO_INDEL) && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) { + if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) { for (i = 0; i < gplp.n; ++i) bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i); if (bcf_call_combine(gplp.n, bcr, -1, &bc) >= 0) { @@ -866,11 +875,11 @@ int bam_mpileup(int argc, char *argv[]) mplp.max_mq = 60; mplp.min_baseQ = 13; mplp.capQ_thres = 0; - mplp.max_depth = 250; + mplp.max_depth = 250; mplp.max_indel_depth = 250; mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 100; mplp.min_frac = 0.002; mplp.min_support = 1; mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN; - while ((c = getopt(argc, argv, "Agf:r:l:M:q:Q:uaRC:BDSd:b:P:o:e:h:Im:F:EG:")) >= 0) { + while ((c = getopt(argc, argv, "Agf:r:l:M:q:Q:uaRC:BDSd:L:b:P:o:e:h:Im:F:EG:6")) >= 0) { switch (c) { case 'f': mplp.fai = fai_load(optarg); @@ -889,6 +898,7 @@ int bam_mpileup(int argc, char *argv[]) case 'S': mplp.flag |= MPLP_FMT_SP; break; case 'I': mplp.flag |= MPLP_NO_INDEL; break; case 'E': mplp.flag |= MPLP_EXT_BAQ; break; + case '6': mplp.flag |= MPLP_ILLUMINA13; break; case 'C': mplp.capQ_thres = atoi(optarg); break; case 'M': mplp.max_mq = atoi(optarg); break; case 'q': mplp.min_mq = atoi(optarg); break; @@ -900,6 +910,7 @@ int bam_mpileup(int argc, char *argv[]) case 'A': use_orphan = 1; break; case 'F': mplp.min_frac = atof(optarg); break; case 'm': mplp.min_support = atoi(optarg); break; + case 'L': mplp.max_indel_depth = atoi(optarg); break; case 'G': { FILE *fp_rg; char buf[1024]; @@ -924,7 +935,8 @@ int bam_mpileup(int argc, char *argv[]) fprintf(stderr, " -M INT cap mapping quality at INT [%d]\n", mplp.max_mq); fprintf(stderr, " -Q INT min base quality [%d]\n", mplp.min_baseQ); fprintf(stderr, " -q INT filter out alignment with MQ smaller than INT [%d]\n", mplp.min_mq); - fprintf(stderr, " -d INT max per-sample depth [%d]\n", mplp.max_depth); + fprintf(stderr, " -d INT max per-BAM depth [%d]\n", mplp.max_depth); + fprintf(stderr, " -L INT max per-sample depth for INDEL calling [%d]\n", mplp.max_indel_depth); fprintf(stderr, " -P STR comma separated list of platforms for indels [all]\n"); fprintf(stderr, " -o INT Phred-scaled gap open sequencing error probability [%d]\n", mplp.openQ); fprintf(stderr, " -e INT Phred-scaled gap extension seq error probability [%d]\n", mplp.extQ); @@ -932,6 +944,7 @@ int bam_mpileup(int argc, char *argv[]) fprintf(stderr, " -m INT minimum gapped reads for indel candidates [%d]\n", mplp.min_support); fprintf(stderr, " -F FLOAT minimum fraction of gapped reads for candidates [%g]\n", mplp.min_frac); fprintf(stderr, " -G FILE exclude read groups listed in FILE [null]\n"); + fprintf(stderr, " -6 quality is in the Illumina-1.3+ encoding\n"); fprintf(stderr, " -A use anomalous read pairs in SNP/INDEL calling\n"); fprintf(stderr, " -g generate BCF output\n"); fprintf(stderr, " -u do not compress BCF output\n"); diff --git a/bam_sort.c b/bam_sort.c index 38f15d6..7aeccff 100644 --- a/bam_sort.c +++ b/bam_sort.c @@ -298,14 +298,19 @@ KSORT_INIT(sort, bam1_p, bam1_lt) static void sort_blocks(int n, int k, bam1_p *buf, const char *prefix, const bam_header_t *h, int is_stdout) { - char *name; + char *name, mode[3]; int i; bamFile fp; ks_mergesort(sort, k, buf, 0); name = (char*)calloc(strlen(prefix) + 20, 1); - if (n >= 0) sprintf(name, "%s.%.4d.bam", prefix, n); - else sprintf(name, "%s.bam", prefix); - fp = is_stdout? bam_dopen(fileno(stdout), "w") : bam_open(name, "w"); + if (n >= 0) { + sprintf(name, "%s.%.4d.bam", prefix, n); + strcpy(mode, "w1"); + } else { + sprintf(name, "%s.bam", prefix); + strcpy(mode, "w"); + } + fp = is_stdout? bam_dopen(fileno(stdout), mode) : bam_open(name, mode); if (fp == 0) { fprintf(stderr, "[sort_blocks] fail to create file %s.\n", name); free(name); diff --git a/bamtk.c b/bamtk.c index 1d11519..5309d5e 100644 --- a/bamtk.c +++ b/bamtk.c @@ -25,6 +25,7 @@ int main_import(int argc, char *argv[]); int main_reheader(int argc, char *argv[]); int main_cut_target(int argc, char *argv[]); int main_phase(int argc, char *argv[]); +int main_cat(int argc, char *argv[]); int faidx_main(int argc, char *argv[]); int glf3_view_main(int argc, char *argv[]); @@ -92,6 +93,7 @@ static int usage() fprintf(stderr, " merge merge sorted alignments\n"); fprintf(stderr, " rmdup remove PCR duplicates\n"); fprintf(stderr, " reheader replace BAM header\n"); + fprintf(stderr, " cat concatenate BAMs\n"); fprintf(stderr, " targetcut cut fosmid regions (for fosmid pool only)\n"); fprintf(stderr, " phase phase heterozygotes\n"); fprintf(stderr, "\n"); @@ -131,6 +133,7 @@ int main(int argc, char *argv[]) else if (strcmp(argv[1], "calmd") == 0) return bam_fillmd(argc-1, argv+1); else if (strcmp(argv[1], "fillmd") == 0) return bam_fillmd(argc-1, argv+1); else if (strcmp(argv[1], "reheader") == 0) return main_reheader(argc-1, argv+1); + else if (strcmp(argv[1], "cat") == 0) return main_cat(argc-1, argv+1); else if (strcmp(argv[1], "targetcut") == 0) return main_cut_target(argc-1, argv+1); else if (strcmp(argv[1], "phase") == 0) return main_phase(argc-1, argv+1); #if _CURSES_LIB != 0 diff --git a/bcftools/bcf.h b/bcftools/bcf.h index f545a91..ba6dbe9 100644 --- a/bcftools/bcf.h +++ b/bcftools/bcf.h @@ -146,6 +146,10 @@ extern "C" { int bcf_is_indel(const bcf1_t *b); bcf_hdr_t *bcf_hdr_subsam(const bcf_hdr_t *h0, int n, char *const* samples, int *list); int bcf_subsam(int n_smpl, int *list, bcf1_t *b); + // move GT to the first FORMAT field + int bcf_fix_gt(bcf1_t *b); + // update PL generated by old samtools + int bcf_fix_pl(bcf1_t *b); // string hash table void *bcf_build_refhash(bcf_hdr_t *h); diff --git a/bcftools/bcf.tex b/bcftools/bcf.tex index 6f2171f..442fc2a 100644 --- a/bcftools/bcf.tex +++ b/bcftools/bcf.tex @@ -33,13 +33,14 @@ \end{center} \begin{center} -\begin{tabular}{cll} +\begin{tabular}{clp{9cm}} \hline \multicolumn{1}{l}{\bf Field} & \multicolumn{1}{l}{\bf Type} & \multicolumn{1}{l}{\bf Description} \\\hline {\tt DP} & {\tt uint16\_t[n]} & Read depth \\ {\tt GL} & {\tt float[n*G]} & Log10 likelihood of data; $G=\frac{A(A+1)}{2}$, $A=\#\{alleles\}$\\ {\tt GT} & {\tt uint8\_t[n]} & {\tt missing\char60\char60 7 | phased\char60\char60 6 | allele1\char60\char60 3 | allele2} \\ -{\tt \_GT} & {\tt uint8\_t+uint8\_t[n*P]} & {Generic GT; the first int equals the max ploidy $P$} \\ +{\tt \_GT} & {\tt uint8\_t+uint8\_t[n*P]} & {Generic GT; the first int equals the max ploidy $P$. If the highest bit is set, + the allele is not present (e.g. due to different ploidy between samples).} \\ {\tt GQ} & {\tt uint8\_t[n]} & {Genotype quality}\\ {\tt HQ} & {\tt uint8\_t[n*2]} & {Haplotype quality}\\ {\tt \_HQ} & {\tt uint8\_t+uint8\_t[n*P]} & {Generic HQ}\\ @@ -67,10 +68,10 @@ BCF proposal). \item Predefined {\tt FORMAT} fields can be missing from VCF headers, but custom {\tt FORMAT} fields are required to be explicitly defined in the headers. -\item A {\tt FORMAT} field with its name starting with `{\tt \_}' gives an alternative - binary representation of the corresponding VCF field. The alternative representation - is used when the default representation is unable to keep the genotype information, - for example, when the ploidy is over 2 or there are more than 8 alleles. +\item A {\tt FORMAT} field with its name starting with `{\tt \_}' is specific to BCF only. + It gives an alternative binary representation of the corresponding VCF field, in case + the default representation is unable to keep the genotype information, + for example, when the ploidy is not 2 or there are more than 8 alleles. \end{itemize} \end{document} diff --git a/bcftools/bcftools.1 b/bcftools/bcftools.1 index 6c7403b..6d11e77 100644 --- a/bcftools/bcftools.1 +++ b/bcftools/bcftools.1 @@ -1,4 +1,4 @@ -.TH bcftools 1 "2 October 2010" "bcftools" "Bioinformatics tools" +.TH bcftools 1 "16 March 2011" "bcftools" "Bioinformatics tools" .SH NAME .PP bcftools - Utilities for the Binary Call Format (BCF) and VCF. @@ -20,53 +20,57 @@ estimating site allele frequencies and allele frequency spectrums. .TP 10 .B view .B bcftools view -.RB [ \-cbuSAGgHvNQ ] -.RB [ \-1 -.IR nGroup1 ] +.RB [ \-AbFGNQSucgv ] +.RB [ \-D +.IR seqDict ] .RB [ \-l -.IR listFile ] +.IR listLoci ] +.RB [ \-s +.IR listSample ] +.RB [ \-i +.IR gapSNPratio ] .RB [ \-t .IR mutRate ] .RB [ \-p .IR varThres ] .RB [ \-P .IR prior ] +.RB [ \-1 +.IR nGroup1 ] +.RB [ \-d +.IR minFrac ] +.RB [ \-U +.IR nPerm ] +.RB [ \-X +.IR permThres ] .I in.bcf .RI [ region ] Convert between BCF and VCF, call variant candidates and estimate allele frequencies. -.B OPTIONS: .RS +.TP +.B Input/Output Options: +.TP 10 +.B -A +Retain all possible alternate alleles at variant sites. By default, the view +command discards unlikely alleles. .TP 10 .B -b Output in the BCF format. The default is VCF. .TP -.B -c -Call variants. -.TP -.B -v -Output variant sites only (force -c) -.TP -.B -g -Call per-sample genotypes at variant sites (force -c) +.BI -D \ FILE +Sequence dictionary (list of chromosome names) for VCF->BCF conversion [null] .TP -.B -u -Uncompressed BCF output (force -b). -.TP -.B -S -The input is VCF instead of BCF. -.TP -.B -A -Retain all possible alternate alleles at variant sites. By default, this -command discards unlikely alleles. +.B -F +Indicate PL is generated by r921 or before (ordering is different). .TP .B -G Suppress all individual genotype information. .TP -.B -H -Perform Hardy-Weiberg Equilibrium test. This will add computation time, sometimes considerably. +.BI -l \ FILE +List of sites at which information are outputted [all sites] .TP .B -N Skip sites where the REF field is not A/C/G/T @@ -74,31 +78,70 @@ Skip sites where the REF field is not A/C/G/T .B -Q Output the QCALL likelihood format .TP -.B -f -Reference-free variant calling mode. In this mode, the prior will be -folded; a variant is called iff the sample(s) contains at least two -alleles; the QUAL field in the VCF/BCF output is changed accordingly. +.BI -s \ FILE +List of samples to use. The first column in the input gives the sample names +and the second gives the ploidy, which can only be 1 or 2. When the 2nd column +is absent, the sample ploidy is assumed to be 2. In the output, the ordering of +samples will be identical to the one in +.IR FILE . +[null] .TP -.BI "-1 " INT -Number of group-1 samples. This option is used for dividing input into -two groups for comparing. A zero value disables this functionality. [0] +.B -S +The input is VCF instead of BCF. .TP -.BI "-l " FILE -List of sites at which information are outputted [all sites] +.B -u +Uncompressed BCF output (force -b). .TP -.BI "-t " FLOAT -Scaled muttion rate for variant calling [0.001] +.B Consensus/Variant Calling Options: +.TP 10 +.B -c +Call variants. +.TP +.BI -d \ FLOAT +When +.B -v +is in use, skip loci where the fraction of samples covered by reads is below FLOAT. [0] +.TP +.B -g +Call per-sample genotypes at variant sites (force -c) .TP -.BI "-p " FLOAT +.BI -i \ FLOAT +Ratio of INDEL-to-SNP mutation rate [0.15] +.TP +.BI -p \ FLOAT A site is considered to be a variant if P(ref|D)n_gi; ++i) + if (b->gi[i].fmt == tmp) break; + if (i == b->n_gi) return 0; + // prepare + gi = b->gi + i; + PL = (uint8_t*)gi->data; + swap = alloca(gi->len); + // loop through individuals + for (i = 0; i < b->n_smpl; ++i) { + int k, l, x; + uint8_t *PLi = PL + i * gi->len; + memcpy(swap, PLi, gi->len); + for (k = x = 0; k < b->n_alleles; ++k) + for (l = k; l < b->n_alleles; ++l) + PLi[l*(l+1)/2 + k] = swap[x++]; + } + return 0; +} + +int bcf_smpl_covered(const bcf1_t *b) +{ + int i, j, n = 0; + uint32_t tmp; + bcf_ginfo_t *gi; + // pinpoint PL + tmp = bcf_str2int("PL", 2); + for (i = 0; i < b->n_gi; ++i) + if (b->gi[i].fmt == tmp) break; + if (i == b->n_gi) return 0; + // count how many samples having PL!=[0..0] + gi = b->gi + i; + for (i = 0; i < b->n_smpl; ++i) { + uint8_t *PLi = ((uint8_t*)gi->data) + i * gi->len; + for (j = 0; j < gi->len; ++j) + if (PLi[j]) break; + if (j < gi->len) ++n; + } + return n; +} + static void *locate_field(const bcf1_t *b, const char *fmt, int l) { int i; @@ -188,6 +236,31 @@ int bcf_anno_max(bcf1_t *b) return 0; } +// FIXME: only data are shuffled; the header is NOT +int bcf_shuffle(bcf1_t *b, int seed) +{ + int i, j, *a; + if (seed > 0) srand48(seed); + a = malloc(b->n_smpl * sizeof(int)); + for (i = 0; i < b->n_smpl; ++i) a[i] = i; + for (i = b->n_smpl; i > 1; --i) { + int tmp; + j = (int)(drand48() * i); + tmp = a[j]; a[j] = a[i-1]; a[i-1] = tmp; + } + for (j = 0; j < b->n_gi; ++j) { + bcf_ginfo_t *gi = b->gi + j; + uint8_t *swap, *data = (uint8_t*)gi->data; + swap = malloc(gi->len * b->n_smpl); + for (i = 0; i < b->n_smpl; ++i) + memcpy(swap + gi->len * a[i], data + gi->len * i, gi->len); + free(gi->data); + gi->data = swap; + } + free(a); + return 0; +} + bcf_hdr_t *bcf_hdr_subsam(const bcf_hdr_t *h0, int n, char *const* samples, int *list) { int i, ret, j; @@ -197,12 +270,15 @@ bcf_hdr_t *bcf_hdr_subsam(const bcf_hdr_t *h0, int n, char *const* samples, int kstring_t s; s.l = s.m = 0; s.s = 0; hash = kh_init(str2id); - for (i = 0; i < n; ++i) - k = kh_put(str2id, hash, samples[i], &ret); - for (i = j = 0; i < h0->n_smpl; ++i) { - if (kh_get(str2id, hash, h0->sns[i]) != kh_end(hash)) { - list[j++] = i; - kputs(h0->sns[i], &s); kputc('\0', &s); + for (i = 0; i < h0->n_smpl; ++i) { + k = kh_put(str2id, hash, h0->sns[i], &ret); + kh_val(hash, k) = i; + } + for (i = j = 0; i < n; ++i) { + k = kh_get(str2id, hash, samples[i]); + if (k != kh_end(hash)) { + list[j++] = kh_val(hash, k); + kputs(samples[i], &s); kputc('\0', &s); } } if (j < n) fprintf(stderr, "<%s> %d samples in the list but not in BCF.", __func__, n - j); @@ -217,13 +293,17 @@ bcf_hdr_t *bcf_hdr_subsam(const bcf_hdr_t *h0, int n, char *const* samples, int return h; } -int bcf_subsam(int n_smpl, int *list, bcf1_t *b) // list MUST BE sorted +int bcf_subsam(int n_smpl, int *list, bcf1_t *b) { int i, j; for (j = 0; j < b->n_gi; ++j) { bcf_ginfo_t *gi = b->gi + j; + uint8_t *swap; + swap = malloc(gi->len * b->n_smpl); for (i = 0; i < n_smpl; ++i) - memcpy((uint8_t*)gi->data + i * gi->len, (uint8_t*)gi->data + list[i] * gi->len, gi->len); + memcpy(swap + i * gi->len, (uint8_t*)gi->data + list[i] * gi->len, gi->len); + free(gi->data); + gi->data = swap; } b->n_smpl = n_smpl; return 0; diff --git a/bcftools/call1.c b/bcftools/call1.c index 286c014..e074fb5 100644 --- a/bcftools/call1.c +++ b/bcftools/call1.c @@ -6,6 +6,7 @@ #include "bcf.h" #include "prob1.h" #include "kstring.h" +#include "time.h" #include "khash.h" KHASH_SET_INIT_INT64(set64) @@ -19,7 +20,6 @@ KSTREAM_INIT(gzFile, gzread, 16384) #define VC_VARONLY 16 #define VC_VCFIN 32 #define VC_UNCOMP 64 -#define VC_HWE 128 #define VC_KEEPALT 256 #define VC_ACGT_ONLY 512 #define VC_QCALL 1024 @@ -27,11 +27,13 @@ KSTREAM_INIT(gzFile, gzread, 16384) #define VC_ADJLD 4096 #define VC_NO_INDEL 8192 #define VC_ANNO_MAX 16384 +#define VC_FIX_PL 32768 typedef struct { - int flag, prior_type, n1, n_sub, *sublist; + int flag, prior_type, n1, n_sub, *sublist, n_perm; char *fn_list, *prior_file, **subsam, *fn_dict; - double theta, pref, indel_frac; + uint8_t *ploidy; + double theta, pref, indel_frac, min_perm_p, min_smpl_frac; } viewconf_t; khash_t(set64) *bcf_load_pos(const char *fn, bcf_hdr_t *_h) @@ -67,23 +69,6 @@ khash_t(set64) *bcf_load_pos(const char *fn, bcf_hdr_t *_h) return hash; } -static double test_hwe(const double g[3]) -{ - extern double kf_gammaq(double p, double x); - double fexp, chi2, f[3], n; - int i; - n = g[0] + g[1] + g[2]; - fexp = (2. * g[2] + g[1]) / (2. * n); - if (fexp > 1. - 1e-10) fexp = 1. - 1e-10; - if (fexp < 1e-10) fexp = 1e-10; - f[0] = n * (1. - fexp) * (1. - fexp); - f[1] = n * 2. * fexp * (1. - fexp); - f[2] = n * fexp * fexp; - for (i = 0, chi2 = 0.; i < 3; ++i) - chi2 += (g[i] - f[i]) * (g[i] - f[i]) / f[i]; - return kf_gammaq(.5, chi2 / 2.); -} - typedef struct { double p[4]; int mq, depth, is_tested, d[4]; @@ -150,12 +135,11 @@ static void rm_info(bcf1_t *b, const char *key) static int update_bcf1(int n_smpl, bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p1rst_t *pr, double pref, int flag) { kstring_t s; - int is_var = (pr->p_ref < pref); - double p_hwe, r = is_var? pr->p_ref : pr->p_var, fq; + int has_I16, is_var = (pr->p_ref < pref); + double fq, r = is_var? pr->p_ref : pr->p_var; anno16_t a; - p_hwe = pr->g[0] >= 0.? test_hwe(pr->g) : 1.0; // only do HWE g[] is calculated - test16(b, &a); + has_I16 = test16(b, &a) >= 0? 1 : 0; rm_info(b, "I16="); memset(&s, 0, sizeof(kstring_t)); @@ -165,17 +149,24 @@ static int update_bcf1(int n_smpl, bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p if (b->info[0]) kputc(';', &s); // ksprintf(&s, "AF1=%.4lg;AFE=%.4lg;CI95=%.4lg,%.4lg", 1.-pr->f_em, 1.-pr->f_exp, pr->cil, pr->cih); ksprintf(&s, "AF1=%.4g;CI95=%.4g,%.4g", 1.-pr->f_em, pr->cil, pr->cih); - ksprintf(&s, ";DP4=%d,%d,%d,%d;MQ=%d", a.d[0], a.d[1], a.d[2], a.d[3], a.mq); + if (has_I16) ksprintf(&s, ";DP4=%d,%d,%d,%d;MQ=%d", a.d[0], a.d[1], a.d[2], a.d[3], a.mq); fq = pr->p_ref_folded < 0.5? -4.343 * log(pr->p_ref_folded) : 4.343 * log(pr->p_var_folded); if (fq < -999) fq = -999; if (fq > 999) fq = 999; ksprintf(&s, ";FQ=%.3g", fq); - if (a.is_tested) { - if (pr->pc[0] >= 0.) ksprintf(&s, ";PC4=%g,%g,%g,%g", pr->pc[0], pr->pc[1], pr->pc[2], pr->pc[3]); - ksprintf(&s, ";PV4=%.2g,%.2g,%.2g,%.2g", a.p[0], a.p[1], a.p[2], a.p[3]); + if (pr->cmp[0] >= 0.) { + int i, q[3], pq; + for (i = 1; i < 3; ++i) { + double x = pr->cmp[i] + pr->cmp[0]/2.; + q[i] = x == 0? 255 : (int)(-4.343 * log(x) + .499); + if (q[i] > 255) q[i] = 255; + } + pq = (int)(-4.343 * log(pr->p_chi2) + .499); + if (pr->perm_rank >= 0) ksprintf(&s, ";PR=%d", pr->perm_rank); + ksprintf(&s, ";QCHI2=%d;PCHI2=%.3g;PC2=%d,%d", pq, q[1], q[2], pr->p_chi2); +// ksprintf(&s, ",%g,%g,%g", pr->cmp[0], pr->cmp[1], pr->cmp[2]); } - if (pr->g[0] >= 0. && p_hwe <= .2) - ksprintf(&s, ";GC=%.2f,%.2f,%.2f;HWE=%.3f", pr->g[2], pr->g[1], pr->g[0], p_hwe); + if (has_I16 && a.is_tested) ksprintf(&s, ";PV4=%.2g,%.2g,%.2g,%.2g", a.p[0], a.p[1], a.p[2], a.p[3]); kputc('\0', &s); kputs(b->fmt, &s); kputc('\0', &s); free(b->str); @@ -214,11 +205,24 @@ static char **read_samples(const char *fn, int *_n) if (fp == 0) return 0; // fail to open file ks = ks_init(fp); while (ks_getuntil(ks, 0, &s, &dret) >= 0) { + int l; if (max == n) { max = max? max<<1 : 4; sam = realloc(sam, sizeof(void*)*max); } - sam[n++] = strdup(s.s); + l = s.l; + sam[n] = malloc(s.l + 2); + strcpy(sam[n], s.s); + sam[n][l+1] = 2; // by default, diploid + if (dret != '\n') { + if (ks_getuntil(ks, 0, &s, &dret) >= 0) { // read ploidy, 1 or 2 + int x = (int)s.s[0] - '0'; + if (x == 1 || x == 2) sam[n][l+1] = x; + else fprintf(stderr, "(%s) ploidy can only be 1 or 2; assume diploid\n", __func__); + } + if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret); + } + ++n; } ks_destroy(ks); gzclose(fp); @@ -239,7 +243,7 @@ static void write_header(bcf_hdr_t *h) if (!strstr(str.s, "##INFO=\n", &str); if (!strstr(str.s, "##INFO=\n", &str); + kputs("##INFO=\n", &str); if (!strstr(str.s, "##INFO=\n", &str); if (!strstr(str.s, "##INFO=\n", &str); if (!strstr(str.s, "##INFO=\n", &str); - if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##FORMAT=\n", &str); if (!strstr(str.s, "##FORMAT=\n", &str); @@ -271,9 +283,10 @@ int bcfview(int argc, char *argv[]) extern void bcf_p1_indel_prior(bcf_p1aux_t *ma, double x); extern int bcf_fix_gt(bcf1_t *b); extern int bcf_anno_max(bcf1_t *b); + extern int bcf_shuffle(bcf1_t *b, int seed); bcf_t *bp, *bout = 0; bcf1_t *b, *blast; - int c; + int c, *seeds = 0; uint64_t n_processed = 0; viewconf_t vc; bcf_p1aux_t *p1 = 0; @@ -284,13 +297,13 @@ int bcfview(int argc, char *argv[]) tid = begin = end = -1; memset(&vc, 0, sizeof(viewconf_t)); - vc.prior_type = vc.n1 = -1; vc.theta = 1e-3; vc.pref = 0.5; vc.indel_frac = -1.; - vc.n_sub = 0; vc.subsam = 0; vc.sublist = 0; - while ((c = getopt(argc, argv, "N1:l:cHAGvbSuP:t:p:QgLi:IMs:D:")) >= 0) { + vc.prior_type = vc.n1 = -1; vc.theta = 1e-3; vc.pref = 0.5; vc.indel_frac = -1.; vc.n_perm = 0; vc.min_perm_p = 0.01; vc.min_smpl_frac = 0; + while ((c = getopt(argc, argv, "FN1:l:cHAGvbSuP:t:p:QgLi:IMs:D:U:X:d:")) >= 0) { switch (c) { case '1': vc.n1 = atoi(optarg); break; case 'l': vc.fn_list = strdup(optarg); break; case 'D': vc.fn_dict = strdup(optarg); break; + case 'F': vc.flag |= VC_FIX_PL; break; case 'N': vc.flag |= VC_ACGT_ONLY; break; case 'G': vc.flag |= VC_NO_GENO; break; case 'A': vc.flag |= VC_KEEPALT; break; @@ -299,7 +312,6 @@ int bcfview(int argc, char *argv[]) case 'c': vc.flag |= VC_CALL; break; case 'v': vc.flag |= VC_VARONLY | VC_CALL; break; case 'u': vc.flag |= VC_UNCOMP | VC_BCFOUT; break; - case 'H': vc.flag |= VC_HWE; break; case 'g': vc.flag |= VC_CALL_GT | VC_CALL; break; case 'I': vc.flag |= VC_NO_INDEL; break; case 'M': vc.flag |= VC_ANNO_MAX; break; @@ -308,7 +320,14 @@ int bcfview(int argc, char *argv[]) case 'i': vc.indel_frac = atof(optarg); break; case 'Q': vc.flag |= VC_QCALL; break; case 'L': vc.flag |= VC_ADJLD; break; - case 's': vc.subsam = read_samples(optarg, &vc.n_sub); break; + case 'U': vc.n_perm = atoi(optarg); break; + case 'X': vc.min_perm_p = atof(optarg); break; + case 'd': vc.min_smpl_frac = atof(optarg); break; + case 's': vc.subsam = read_samples(optarg, &vc.n_sub); + vc.ploidy = calloc(vc.n_sub + 1, 1); + for (tid = 0; tid < vc.n_sub; ++tid) vc.ploidy[tid] = vc.subsam[tid][strlen(vc.subsam[tid]) + 1]; + tid = -1; + break; case 'P': if (strcmp(optarg, "full") == 0) vc.prior_type = MC_PTYPE_FULL; else if (strcmp(optarg, "cond2") == 0) vc.prior_type = MC_PTYPE_COND2; @@ -328,19 +347,22 @@ int bcfview(int argc, char *argv[]) fprintf(stderr, " -S input is VCF\n"); fprintf(stderr, " -A keep all possible alternate alleles at variant sites\n"); fprintf(stderr, " -G suppress all individual genotype information\n"); - fprintf(stderr, " -H perform Hardy-Weinberg test (slower)\n"); fprintf(stderr, " -N skip sites where REF is not A/C/G/T\n"); fprintf(stderr, " -Q output the QCALL likelihood format\n"); fprintf(stderr, " -L calculate LD for adjacent sites\n"); fprintf(stderr, " -I skip indels\n"); + fprintf(stderr, " -F PL generated by r921 or before\n"); + fprintf(stderr, " -d FLOAT skip loci where less than FLOAT fraction of samples covered [0]\n"); fprintf(stderr, " -D FILE sequence dictionary for VCF->BCF conversion [null]\n"); - fprintf(stderr, " -1 INT number of group-1 samples [0]\n"); fprintf(stderr, " -l FILE list of sites to output [all sites]\n"); fprintf(stderr, " -t FLOAT scaled substitution mutation rate [%.4g]\n", vc.theta); fprintf(stderr, " -i FLOAT indel-to-substitution ratio [%.4g]\n", vc.indel_frac); fprintf(stderr, " -p FLOAT variant if P(ref|D)BCF conversion please specify the sequence dictionary with -D\n", __func__); return 1; } + if (vc.n1 <= 0) vc.n_perm = 0; // TODO: give a warning here! + if (vc.n_perm > 0) { + seeds = malloc(vc.n_perm * sizeof(int)); + srand48(time(0)); + for (c = 0; c < vc.n_perm; ++c) seeds[c] = lrand48(); + } b = calloc(1, sizeof(bcf1_t)); blast = calloc(1, sizeof(bcf1_t)); strcpy(moder, "r"); @@ -370,7 +398,7 @@ int bcfview(int argc, char *argv[]) vcf_hdr_write(bout, hout); } if (vc.flag & VC_CALL) { - p1 = bcf_p1_init(hout->n_smpl); + p1 = bcf_p1_init(hout->n_smpl, vc.ploidy); if (vc.prior_file) { if (bcf_p1_read_prior(p1, vc.prior_file) < 0) { fprintf(stderr, "[%s] fail to read the prior AFS.\n", __func__); @@ -403,7 +431,14 @@ int bcfview(int argc, char *argv[]) } while (vcf_read(bp, hin, b) > 0) { int is_indel; + if ((vc.flag & VC_VARONLY) && strcmp(b->alt, "X") == 0) continue; + if ((vc.flag & VC_VARONLY) && vc.min_smpl_frac > 0.) { + extern int bcf_smpl_covered(const bcf1_t *b); + int n = bcf_smpl_covered(b); + if ((double)n / b->n_smpl < vc.min_smpl_frac) continue; + } if (vc.n_sub) bcf_subsam(vc.n_sub, vc.sublist, b); + if (vc.flag & VC_FIX_PL) bcf_fix_pl(b); is_indel = bcf_is_indel(b); if ((vc.flag & VC_NO_INDEL) && is_indel) continue; if ((vc.flag & VC_ACGT_ONLY) && !is_indel) { @@ -434,12 +469,21 @@ int bcfview(int argc, char *argv[]) if (vc.flag & VC_CALL) { // call variants bcf_p1rst_t pr; bcf_p1_cal(b, p1, &pr); // pr.g[3] is not calculated here - if (vc.flag&VC_HWE) bcf_p1_cal_g3(p1, pr.g); if (n_processed % 100000 == 0) { fprintf(stderr, "[%s] %ld sites processed.\n", __func__, (long)n_processed); bcf_p1_dump_afs(p1); } if (pr.p_ref >= vc.pref && (vc.flag & VC_VARONLY)) continue; + if (vc.n_perm && vc.n1 > 0 && pr.p_chi2 < vc.min_perm_p) { // permutation test + bcf_p1rst_t r; + int i, n = 0; + for (i = 0; i < vc.n_perm; ++i) { + bcf_shuffle(b, seeds[i]); + bcf_p1_cal(b, p1, &r); + if (pr.p_chi2 >= r.p_chi2) ++n; + } + pr.perm_rank = n; + } update_bcf1(hout->n_smpl, b, p1, &pr, vc.pref, vc.flag); } if (vc.flag & VC_ADJLD) { // compute LD @@ -471,11 +515,13 @@ int bcfview(int argc, char *argv[]) if (hash) kh_destroy(set64, hash); if (vc.fn_list) free(vc.fn_list); if (vc.fn_dict) free(vc.fn_dict); + if (vc.ploidy) free(vc.ploidy); if (vc.n_sub) { int i; for (i = 0; i < vc.n_sub; ++i) free(vc.subsam[i]); free(vc.subsam); free(vc.sublist); } + if (seeds) free(seeds); if (p1) bcf_p1_destroy(p1); return 0; } diff --git a/bcftools/prob1.c b/bcftools/prob1.c index 193c4a0..503a998 100644 --- a/bcftools/prob1.c +++ b/bcftools/prob1.c @@ -3,6 +3,7 @@ #include #include #include +#include #include "prob1.h" #include "kseq.h" @@ -33,10 +34,12 @@ unsigned char seq_nt4_table[256] = { struct __bcf_p1aux_t { int n, M, n1, is_indel; + uint8_t *ploidy; // haploid or diploid ONLY double *q2p, *pdg; // pdg -> P(D|g) double *phi, *phi_indel; double *z, *zswap; // aux for afs double *z1, *z2, *phi1, *phi2; // only calculated when n1 is set + double **hg; // hypergeometric distribution double t, t1, t2; double *afs, *afs1; // afs: accumulative AFS; afs1: site posterior distribution const uint8_t *PL; // point to PL @@ -123,25 +126,34 @@ int bcf_p1_read_prior(bcf_p1aux_t *ma, const char *fn) return 0; } -bcf_p1aux_t *bcf_p1_init(int n) +bcf_p1aux_t *bcf_p1_init(int n, uint8_t *ploidy) { bcf_p1aux_t *ma; int i; ma = calloc(1, sizeof(bcf_p1aux_t)); ma->n1 = -1; ma->n = n; ma->M = 2 * n; + if (ploidy) { + ma->ploidy = malloc(n); + memcpy(ma->ploidy, ploidy, n); + for (i = 0, ma->M = 0; i < n; ++i) ma->M += ploidy[i]; + if (ma->M == 2 * n) { + free(ma->ploidy); + ma->ploidy = 0; + } + } ma->q2p = calloc(256, sizeof(double)); ma->pdg = calloc(3 * ma->n, sizeof(double)); ma->phi = calloc(ma->M + 1, sizeof(double)); ma->phi_indel = calloc(ma->M + 1, sizeof(double)); ma->phi1 = calloc(ma->M + 1, sizeof(double)); ma->phi2 = calloc(ma->M + 1, sizeof(double)); - ma->z = calloc(2 * ma->n + 1, sizeof(double)); - ma->zswap = calloc(2 * ma->n + 1, sizeof(double)); + ma->z = calloc(ma->M + 1, sizeof(double)); + ma->zswap = calloc(ma->M + 1, sizeof(double)); ma->z1 = calloc(ma->M + 1, sizeof(double)); // actually we do not need this large ma->z2 = calloc(ma->M + 1, sizeof(double)); - ma->afs = calloc(2 * ma->n + 1, sizeof(double)); - ma->afs1 = calloc(2 * ma->n + 1, sizeof(double)); + ma->afs = calloc(ma->M + 1, sizeof(double)); + ma->afs1 = calloc(ma->M + 1, sizeof(double)); for (i = 0; i < 256; ++i) ma->q2p[i] = pow(10., -i / 10.); bcf_p1_init_prior(ma, MC_PTYPE_FULL, 1e-3); // the simplest prior @@ -151,6 +163,10 @@ bcf_p1aux_t *bcf_p1_init(int n) int bcf_p1_set_n1(bcf_p1aux_t *b, int n1) { if (n1 == 0 || n1 >= b->n) return -1; + if (b->M != b->n * 2) { + fprintf(stderr, "[%s] unable to set `n1' when there are haploid samples.\n", __func__); + return -1; + } b->n1 = n1; return 0; } @@ -158,7 +174,12 @@ int bcf_p1_set_n1(bcf_p1aux_t *b, int n1) void bcf_p1_destroy(bcf_p1aux_t *ma) { if (ma) { - free(ma->q2p); free(ma->pdg); + int k; + if (ma->hg && ma->n1 > 0) { + for (k = 0; k <= 2*ma->n1; ++k) free(ma->hg[k]); + free(ma->hg); + } + free(ma->ploidy); free(ma->q2p); free(ma->pdg); free(ma->phi); free(ma->phi_indel); free(ma->phi1); free(ma->phi2); free(ma->z); free(ma->zswap); free(ma->z1); free(ma->z2); free(ma->afs); free(ma->afs1); @@ -207,8 +228,13 @@ int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k) { double sum, g[3]; double max, f3[3], *pdg = ma->pdg + k * 3; - int q, i, max_i; - f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0; + int q, i, max_i, ploidy; + ploidy = ma->ploidy? ma->ploidy[k] : 2; + if (ploidy == 2) { + f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0; + } else { + f3[0] = 1. - f0; f3[1] = 0; f3[2] = f0; + } for (i = 0, sum = 0.; i < 3; ++i) sum += (g[i] = pdg[i] * f3[i]); for (i = 0, max = -1., max_i = 0; i < 3; ++i) { @@ -228,6 +254,7 @@ static void mc_cal_y_core(bcf_p1aux_t *ma, int beg) { double *z[2], *tmp, *pdg; int _j, last_min, last_max; + assert(beg == 0 || ma->M == ma->n*2); z[0] = ma->z; z[1] = ma->zswap; pdg = ma->pdg; @@ -236,41 +263,81 @@ static void mc_cal_y_core(bcf_p1aux_t *ma, int beg) z[0][0] = 1.; last_min = last_max = 0; ma->t = 0.; - for (_j = beg; _j < ma->n; ++_j) { - int k, j = _j - beg, _min = last_min, _max = last_max; - double p[3], sum; - pdg = ma->pdg + _j * 3; - p[0] = pdg[0]; p[1] = 2. * pdg[1]; p[2] = pdg[2]; - for (; _min < _max && z[0][_min] < TINY; ++_min) z[0][_min] = z[1][_min] = 0.; - for (; _max > _min && z[0][_max] < TINY; --_max) z[0][_max] = z[1][_max] = 0.; - _max += 2; - if (_min == 0) - k = 0, z[1][k] = (2*j+2-k)*(2*j-k+1) * p[0] * z[0][k]; - if (_min <= 1) - k = 1, z[1][k] = (2*j+2-k)*(2*j-k+1) * p[0] * z[0][k] + k*(2*j+2-k) * p[1] * z[0][k-1]; - for (k = _min < 2? 2 : _min; k <= _max; ++k) - z[1][k] = (2*j+2-k)*(2*j-k+1) * p[0] * z[0][k] - + k*(2*j+2-k) * p[1] * z[0][k-1] - + k*(k-1)* p[2] * z[0][k-2]; - for (k = _min, sum = 0.; k <= _max; ++k) sum += z[1][k]; - ma->t += log(sum / ((2. * j + 2) * (2. * j + 1))); - for (k = _min; k <= _max; ++k) z[1][k] /= sum; - if (_min >= 1) z[1][_min-1] = 0.; - if (_min >= 2) z[1][_min-2] = 0.; - if (j < ma->n - 1) z[1][_max+1] = z[1][_max+2] = 0.; - if (_j == ma->n1 - 1) { // set pop1 - ma->t1 = ma->t; - memcpy(ma->z1, z[1], sizeof(double) * (ma->n1 * 2 + 1)); + if (ma->M == ma->n * 2) { + int M = 0; + for (_j = beg; _j < ma->n; ++_j) { + int k, j = _j - beg, _min = last_min, _max = last_max, M0; + double p[3], sum; + M0 = M; M += 2; + pdg = ma->pdg + _j * 3; + p[0] = pdg[0]; p[1] = 2. * pdg[1]; p[2] = pdg[2]; + for (; _min < _max && z[0][_min] < TINY; ++_min) z[0][_min] = z[1][_min] = 0.; + for (; _max > _min && z[0][_max] < TINY; --_max) z[0][_max] = z[1][_max] = 0.; + _max += 2; + if (_min == 0) k = 0, z[1][k] = (M0-k+1) * (M0-k+2) * p[0] * z[0][k]; + if (_min <= 1) k = 1, z[1][k] = (M0-k+1) * (M0-k+2) * p[0] * z[0][k] + k*(M0-k+2) * p[1] * z[0][k-1]; + for (k = _min < 2? 2 : _min; k <= _max; ++k) + z[1][k] = (M0-k+1)*(M0-k+2) * p[0] * z[0][k] + k*(M0-k+2) * p[1] * z[0][k-1] + k*(k-1)* p[2] * z[0][k-2]; + for (k = _min, sum = 0.; k <= _max; ++k) sum += z[1][k]; + ma->t += log(sum / (M * (M - 1.))); + for (k = _min; k <= _max; ++k) z[1][k] /= sum; + if (_min >= 1) z[1][_min-1] = 0.; + if (_min >= 2) z[1][_min-2] = 0.; + if (j < ma->n - 1) z[1][_max+1] = z[1][_max+2] = 0.; + if (_j == ma->n1 - 1) { // set pop1; ma->n1==-1 when unset + ma->t1 = ma->t; + memcpy(ma->z1, z[1], sizeof(double) * (ma->n1 * 2 + 1)); + } + tmp = z[0]; z[0] = z[1]; z[1] = tmp; + last_min = _min; last_max = _max; + } + //for (_j = 0; _j < last_min; ++_j) z[0][_j] = 0.; // TODO: are these necessary? + //for (_j = last_max + 1; _j < ma->M; ++_j) z[0][_j] = 0.; + } else { // this block is very similar to the block above; these two might be merged in future + int j, M = 0; + for (j = 0; j < ma->n; ++j) { + int k, M0, _min = last_min, _max = last_max; + double p[3], sum; + pdg = ma->pdg + j * 3; + for (; _min < _max && z[0][_min] < TINY; ++_min) z[0][_min] = z[1][_min] = 0.; + for (; _max > _min && z[0][_max] < TINY; --_max) z[0][_max] = z[1][_max] = 0.; + M0 = M; + M += ma->ploidy[j]; + if (ma->ploidy[j] == 1) { + p[0] = pdg[0]; p[1] = pdg[2]; + _max++; + if (_min == 0) k = 0, z[1][k] = (M0+1-k) * p[0] * z[0][k]; + for (k = _min < 1? 1 : _min; k <= _max; ++k) + z[1][k] = (M0+1-k) * p[0] * z[0][k] + k * p[1] * z[0][k-1]; + for (k = _min, sum = 0.; k <= _max; ++k) sum += z[1][k]; + ma->t += log(sum / M); + for (k = _min; k <= _max; ++k) z[1][k] /= sum; + if (_min >= 1) z[1][_min-1] = 0.; + if (j < ma->n - 1) z[1][_max+1] = 0.; + } else if (ma->ploidy[j] == 2) { + p[0] = pdg[0]; p[1] = 2 * pdg[1]; p[2] = pdg[2]; + _max += 2; + if (_min == 0) k = 0, z[1][k] = (M0-k+1) * (M0-k+2) * p[0] * z[0][k]; + if (_min <= 1) k = 1, z[1][k] = (M0-k+1) * (M0-k+2) * p[0] * z[0][k] + k*(M0-k+2) * p[1] * z[0][k-1]; + for (k = _min < 2? 2 : _min; k <= _max; ++k) + z[1][k] = (M0-k+1)*(M0-k+2) * p[0] * z[0][k] + k*(M0-k+2) * p[1] * z[0][k-1] + k*(k-1)* p[2] * z[0][k-2]; + for (k = _min, sum = 0.; k <= _max; ++k) sum += z[1][k]; + ma->t += log(sum / (M * (M - 1.))); + for (k = _min; k <= _max; ++k) z[1][k] /= sum; + if (_min >= 1) z[1][_min-1] = 0.; + if (_min >= 2) z[1][_min-2] = 0.; + if (j < ma->n - 1) z[1][_max+1] = z[1][_max+2] = 0.; + } + tmp = z[0]; z[0] = z[1]; z[1] = tmp; + last_min = _min; last_max = _max; } - tmp = z[0]; z[0] = z[1]; z[1] = tmp; - last_min = _min; last_max = _max; } if (z[0] != ma->z) memcpy(ma->z, z[0], sizeof(double) * (ma->M + 1)); } static void mc_cal_y(bcf_p1aux_t *ma) { - if (ma->n1 > 0 && ma->n1 < ma->n) { + if (ma->n1 > 0 && ma->n1 < ma->n && ma->M == ma->n * 2) { // NB: ma->n1 is ineffective when there are haploid samples int k; long double x; memset(ma->z1, 0, sizeof(double) * (2 * ma->n1 + 1)); @@ -286,26 +353,104 @@ static void mc_cal_y(bcf_p1aux_t *ma) } else mc_cal_y_core(ma, 0); } -static void contrast(bcf_p1aux_t *ma, double pc[4]) // mc_cal_y() must be called before hand +#define CONTRAST_TINY 1e-30 + +extern double kf_gammaq(double s, double z); // incomplete gamma function for chi^2 test + +static inline double chi2_test(int a, int b, int c, int d) +{ + double x, z; + x = (double)(a+b) * (c+d) * (b+d) * (a+c); + if (x == 0.) return 1; + z = a * d - b * c; + return kf_gammaq(.5, .5 * z * z * (a+b+c+d) / x); +} + +// chi2=(a+b+c+d)(ad-bc)^2/[(a+b)(c+d)(a+c)(b+d)] +static inline double contrast2_aux(const bcf_p1aux_t *p1, double sum, int n1, int n2, int k1, int k2, double x[3]) +{ + double p = p1->phi[k1+k2] * p1->z1[k1] * p1->z2[k2] / sum * p1->hg[k1][k2]; + if (p < CONTRAST_TINY) return -1; + if (.5*k1/n1 < .5*k2/n2) x[1] += p; + else if (.5*k1/n1 > .5*k2/n2) x[2] += p; + else x[0] += p; + return p * chi2_test(k1, k2, (n1<<1) - k1, (n2<<1) - k2); +} + +static double contrast2(bcf_p1aux_t *p1, double ret[3]) { - int k, n1 = ma->n1, n2 = ma->n - ma->n1; - long double sum1, sum2; - pc[0] = pc[1] = pc[2] = pc[3] = -1.; - if (n1 <= 0 || n2 <= 0) return; - for (k = 0, sum1 = 0.; k <= 2*n1; ++k) sum1 += ma->phi1[k] * ma->z1[k]; - for (k = 0, sum2 = 0.; k <= 2*n2; ++k) sum2 += ma->phi2[k] * ma->z2[k]; - pc[2] = ma->phi1[2*n1] * ma->z1[2*n1] / sum1; - pc[3] = ma->phi2[2*n2] * ma->z2[2*n2] / sum2; - for (k = 2; k < 4; ++k) { - pc[k] = pc[k] > .5? -(-4.343 * log(1. - pc[k] + TINY) + .499) : -4.343 * log(pc[k] + TINY) + .499; - pc[k] = (int)pc[k]; - if (pc[k] > 99) pc[k] = 99; - if (pc[k] < -99) pc[k] = -99; + int k, k1, k2, k10, k20, n1, n2; + double sum; + // get n1 and n2 + n1 = p1->n1; n2 = p1->n - p1->n1; + if (n1 <= 0 || n2 <= 0) return 0.; + if (p1->hg == 0) { // initialize the hypergeometric distribution + /* NB: the hg matrix may take a lot of memory when there are many samples. There is a way + to avoid precomputing this matrix, but it is slower and quite intricate. The following + computation in this block can be accelerated with a similar strategy, but perhaps this + is not a serious concern for now. */ + double tmp = lgamma(2*(n1+n2)+1) - (lgamma(2*n1+1) + lgamma(2*n2+1)); + p1->hg = calloc(2*n1+1, sizeof(void*)); + for (k1 = 0; k1 <= 2*n1; ++k1) { + p1->hg[k1] = calloc(2*n2+1, sizeof(double)); + for (k2 = 0; k2 <= 2*n2; ++k2) + p1->hg[k1][k2] = exp(lgamma(k1+k2+1) + lgamma(p1->M-k1-k2+1) - (lgamma(k1+1) + lgamma(k2+1) + lgamma(2*n1-k1+1) + lgamma(2*n2-k2+1) + tmp)); + } + } + { // compute sum1 and sum2 + long double suml = 0; + for (k = 0; k <= p1->M; ++k) suml += p1->phi[k] * p1->z[k]; + sum = suml; + } + { // get the mean k1 and k2 + double max; + int max_k; + for (k = 0, max = 0, max_k = -1; k <= 2*n1; ++k) { + double x = p1->phi1[k] * p1->z1[k]; + if (x > max) max = x, max_k = k; + } + k10 = max_k; + for (k = 0, max = 0, max_k = -1; k <= 2*n2; ++k) { + double x = p1->phi2[k] * p1->z2[k]; + if (x > max) max = x, max_k = k; + } + k20 = max_k; + } + { // We can do the following with one nested loop, but that is an O(N^2) thing. The following code block is much faster for large N. + double x[3], y; + long double z = 0.; + x[0] = x[1] = x[2] = 0; + for (k1 = k10; k1 >= 0; --k1) { + for (k2 = k20; k2 >= 0; --k2) { + if ((y = contrast2_aux(p1, sum, n1, n2, k1, k2, x)) < 0) break; + else z += y; + } + for (k2 = k20 + 1; k2 <= 2*n2; ++k2) { + if ((y = contrast2_aux(p1, sum, n1, n2, k1, k2, x)) < 0) break; + else z += y; + } + } + ret[0] = x[0]; ret[1] = x[1]; ret[2] = x[2]; + x[0] = x[1] = x[2] = 0; + for (k1 = k10 + 1; k1 <= 2*n1; ++k1) { + for (k2 = k20; k2 >= 0; --k2) { + if ((y = contrast2_aux(p1, sum, n1, n2, k1, k2, x)) < 0) break; + else z += y; + } + for (k2 = k20 + 1; k2 <= 2*n2; ++k2) { + if ((y = contrast2_aux(p1, sum, n1, n2, k1, k2, x)) < 0) break; + else z += y; + } + } + ret[0] += x[0]; ret[1] += x[1]; ret[2] += x[2]; + if (ret[0] + ret[1] + ret[2] < 0.99) { // in case of bad things happened + ret[0] = ret[1] = ret[2] = 0; + for (k1 = 0, z = 0.; k1 <= 2*n1; ++k1) + for (k2 = 0; k2 <= 2*n2; ++k2) + if ((y = contrast2_aux(p1, sum, n1, n2, k1, k2, ret)) >= 0) z += y; + } + return (double)z; } - pc[0] = ma->phi2[2*n2] * ma->z2[2*n2] / sum2 * (1. - ma->phi1[2*n1] * ma->z1[2*n1] / sum1); - pc[1] = ma->phi1[2*n1] * ma->z1[2*n1] / sum1 * (1. - ma->phi2[2*n2] * ma->z2[2*n2] / sum2); - pc[0] = pc[0] == 1.? 99 : (int)(-4.343 * log(1. - pc[0]) + .499); - pc[1] = pc[1] == 1.? 99 : (int)(-4.343 * log(1. - pc[1]) + .499); } static double mc_cal_afs(bcf_p1aux_t *ma, double *p_ref_folded, double *p_var_folded) @@ -337,37 +482,12 @@ static double mc_cal_afs(bcf_p1aux_t *ma, double *p_ref_folded, double *p_var_fo return sum / ma->M; } -long double bcf_p1_cal_g3(bcf_p1aux_t *p1a, double g[3]) -{ - long double pd = 0., g2[3]; - int i, k; - memset(g2, 0, sizeof(long double) * 3); - for (k = 0; k < p1a->M; ++k) { - double f = (double)k / p1a->M, f3[3], g1[3]; - long double z = 1.; - g1[0] = g1[1] = g1[2] = 0.; - f3[0] = (1. - f) * (1. - f); f3[1] = 2. * f * (1. - f); f3[2] = f * f; - for (i = 0; i < p1a->n; ++i) { - double *pdg = p1a->pdg + i * 3; - double x = pdg[0] * f3[0] + pdg[1] * f3[1] + pdg[2] * f3[2]; - z *= x; - g1[0] += pdg[0] * f3[0] / x; - g1[1] += pdg[1] * f3[1] / x; - g1[2] += pdg[2] * f3[2] / x; - } - pd += p1a->phi[k] * z; - for (i = 0; i < 3; ++i) - g2[i] += p1a->phi[k] * z * g1[i]; - } - for (i = 0; i < 3; ++i) g[i] = g2[i] / pd; - return pd; -} - int bcf_p1_cal(const bcf1_t *b, bcf_p1aux_t *ma, bcf_p1rst_t *rst) { int i, k; long double sum = 0.; ma->is_indel = bcf_is_indel(b); + rst->perm_rank = -1; // set PL and PL_len for (i = 0; i < b->n_gi; ++i) { if (b->gi[i].fmt == bcf_str2int("PL", 2)) { @@ -415,7 +535,9 @@ int bcf_p1_cal(const bcf1_t *b, bcf_p1aux_t *ma, bcf_p1rst_t *rst) rst->cil = (double)(ma->M - h) / ma->M; rst->cih = (double)(ma->M - l) / ma->M; } rst->g[0] = rst->g[1] = rst->g[2] = -1.; - contrast(ma, rst->pc); + rst->cmp[0] = rst->cmp[1] = rst->cmp[2] = rst->p_chi2 = -1.0; + if (rst->p_var > 0.1) // skip contrast2() if the locus is a strong non-variant + rst->p_chi2 = contrast2(ma, rst->cmp); return 0; } diff --git a/bcftools/prob1.h b/bcftools/prob1.h index b4f6a30..3f89dda 100644 --- a/bcftools/prob1.h +++ b/bcftools/prob1.h @@ -7,10 +7,10 @@ struct __bcf_p1aux_t; typedef struct __bcf_p1aux_t bcf_p1aux_t; typedef struct { - int rank0; + int rank0, perm_rank; // NB: perm_rank is always set to -1 by bcf_p1_cal() double f_em, f_exp, f_flat, p_ref_folded, p_ref, p_var_folded, p_var; double cil, cih; - double pc[4]; + double cmp[3], p_chi2; // used by contrast2() double g[3]; } bcf_p1rst_t; @@ -22,7 +22,7 @@ typedef struct { extern "C" { #endif - bcf_p1aux_t *bcf_p1_init(int n); + bcf_p1aux_t *bcf_p1_init(int n, uint8_t *ploidy); void bcf_p1_init_prior(bcf_p1aux_t *ma, int type, double theta); void bcf_p1_init_subprior(bcf_p1aux_t *ma, int type, double theta); void bcf_p1_destroy(bcf_p1aux_t *ma); @@ -30,7 +30,6 @@ extern "C" { int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k); void bcf_p1_dump_afs(bcf_p1aux_t *ma); int bcf_p1_read_prior(bcf_p1aux_t *ma, const char *fn); - long double bcf_p1_cal_g3(bcf_p1aux_t *p1a, double g[3]); int bcf_p1_set_n1(bcf_p1aux_t *b, int n1); void bcf_p1_set_folded(bcf_p1aux_t *p1a); // only effective when set_n1() is not called diff --git a/bgzf.c b/bgzf.c index 66d6b02..db970c8 100644 --- a/bgzf.c +++ b/bgzf.c @@ -148,7 +148,7 @@ open_read(int fd) static BGZF* -open_write(int fd, bool is_uncompressed) +open_write(int fd, int compress_level) // compress_level==-1 for the default level { FILE* file = fdopen(fd, "w"); BGZF* fp; @@ -156,7 +156,9 @@ open_write(int fd, bool is_uncompressed) fp = malloc(sizeof(BGZF)); fp->file_descriptor = fd; fp->open_mode = 'w'; - fp->owned_file = 0; fp->is_uncompressed = is_uncompressed; + fp->owned_file = 0; + fp->compress_level = compress_level < 0? Z_DEFAULT_COMPRESSION : compress_level; // Z_DEFAULT_COMPRESSION==-1 + if (fp->compress_level > 9) fp->compress_level = Z_DEFAULT_COMPRESSION; #ifdef _USE_KNETFILE fp->x.fpw = file; #else @@ -195,13 +197,20 @@ bgzf_open(const char* __restrict path, const char* __restrict mode) fp = open_read(fd); #endif } else if (strchr(mode, 'w') || strchr(mode, 'W')) { - int fd, oflag = O_WRONLY | O_CREAT | O_TRUNC; + int fd, compress_level = -1, oflag = O_WRONLY | O_CREAT | O_TRUNC; #ifdef _WIN32 oflag |= O_BINARY; #endif fd = open(path, oflag, 0666); if (fd == -1) return 0; - fp = open_write(fd, strchr(mode, 'u')? 1 : 0); + { // set compress_level + int i; + for (i = 0; mode[i]; ++i) + if (mode[i] >= '0' && mode[i] <= '9') break; + if (mode[i]) compress_level = (int)mode[i] - '0'; + if (strchr(mode, 'u')) compress_level = 0; + } + fp = open_write(fd, compress_level); } if (fp != NULL) fp->owned_file = 1; return fp; @@ -214,7 +223,12 @@ bgzf_fdopen(int fd, const char * __restrict mode) if (mode[0] == 'r' || mode[0] == 'R') { return open_read(fd); } else if (mode[0] == 'w' || mode[0] == 'W') { - return open_write(fd, strstr(mode, "u")? 1 : 0); + int i, compress_level = -1; + for (i = 0; mode[i]; ++i) + if (mode[i] >= '0' && mode[i] <= '9') break; + if (mode[i]) compress_level = (int)mode[i] - '0'; + if (strchr(mode, 'u')) compress_level = 0; + return open_write(fd, compress_level); } else { return NULL; } @@ -254,7 +268,6 @@ deflate_block(BGZF* fp, int block_length) int input_length = block_length; int compressed_length = 0; while (1) { - int compress_level = fp->is_uncompressed? 0 : Z_DEFAULT_COMPRESSION; z_stream zs; zs.zalloc = NULL; zs.zfree = NULL; @@ -263,7 +276,7 @@ deflate_block(BGZF* fp, int block_length) zs.next_out = (void*)&buffer[BLOCK_HEADER_LENGTH]; zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH; - int status = deflateInit2(&zs, compress_level, Z_DEFLATED, + int status = deflateInit2(&zs, fp->compress_level, Z_DEFLATED, GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY); if (status != Z_OK) { report_error(fp, "deflate init failed"); @@ -330,6 +343,7 @@ inflate_block(BGZF* fp, int block_length) // Inflate the block in fp->compressed_block into fp->uncompressed_block z_stream zs; + int status; zs.zalloc = NULL; zs.zfree = NULL; zs.next_in = fp->compressed_block + 18; @@ -337,7 +351,7 @@ inflate_block(BGZF* fp, int block_length) zs.next_out = fp->uncompressed_block; zs.avail_out = fp->uncompressed_block_size; - int status = inflateInit2(&zs, GZIP_WINDOW_BITS); + status = inflateInit2(&zs, GZIP_WINDOW_BITS); if (status != Z_OK) { report_error(fp, "inflate init failed"); return -1; @@ -431,7 +445,7 @@ int bgzf_read_block(BGZF* fp) { bgzf_byte_t header[BLOCK_HEADER_LENGTH]; - int count, size = 0; + int count, size = 0, block_length, remaining; #ifdef _USE_KNETFILE int64_t block_address = knet_tell(fp->x.fpr); if (load_block_from_cache(fp, block_address)) return 0; @@ -454,10 +468,10 @@ bgzf_read_block(BGZF* fp) report_error(fp, "invalid block header"); return -1; } - int block_length = unpackInt16((uint8_t*)&header[16]) + 1; + block_length = unpackInt16((uint8_t*)&header[16]) + 1; bgzf_byte_t* compressed_block = (bgzf_byte_t*) fp->compressed_block; memcpy(compressed_block, header, BLOCK_HEADER_LENGTH); - int remaining = block_length - BLOCK_HEADER_LENGTH; + remaining = block_length - BLOCK_HEADER_LENGTH; #ifdef _USE_KNETFILE count = knet_read(fp->x.fpr, &compressed_block[BLOCK_HEADER_LENGTH], remaining); #else @@ -494,7 +508,8 @@ bgzf_read(BGZF* fp, void* data, int length) int bytes_read = 0; bgzf_byte_t* output = data; while (bytes_read < length) { - int available = fp->block_length - fp->block_offset; + int copy_length, available = fp->block_length - fp->block_offset; + bgzf_byte_t *buffer; if (available <= 0) { if (bgzf_read_block(fp) != 0) { return -1; @@ -504,8 +519,8 @@ bgzf_read(BGZF* fp, void* data, int length) break; } } - int copy_length = bgzf_min(length-bytes_read, available); - bgzf_byte_t* buffer = fp->uncompressed_block; + copy_length = bgzf_min(length-bytes_read, available); + buffer = fp->uncompressed_block; memcpy(output, buffer + fp->block_offset, copy_length); fp->block_offset += copy_length; output += copy_length; @@ -552,6 +567,8 @@ int bgzf_flush_try(BGZF *fp, int size) int bgzf_write(BGZF* fp, const void* data, int length) { + const bgzf_byte_t *input = data; + int block_length, bytes_written; if (fp->open_mode != 'w') { report_error(fp, "file not open for writing"); return -1; @@ -560,9 +577,9 @@ int bgzf_write(BGZF* fp, const void* data, int length) if (fp->uncompressed_block == NULL) fp->uncompressed_block = malloc(fp->uncompressed_block_size); - const bgzf_byte_t* input = data; - int block_length = fp->uncompressed_block_size; - int bytes_written = 0; + input = data; + block_length = fp->uncompressed_block_size; + bytes_written = 0; while (bytes_written < length) { int copy_length = bgzf_min(block_length - fp->block_offset, length - bytes_written); bgzf_byte_t* buffer = fp->uncompressed_block; diff --git a/bgzf.h b/bgzf.h index 099ae9a..9dc7b5f 100644 --- a/bgzf.h +++ b/bgzf.h @@ -26,7 +26,6 @@ #include #include -#include #include #ifdef _USE_KNETFILE #include "knetfile.h" @@ -37,7 +36,7 @@ typedef struct { int file_descriptor; char open_mode; // 'r' or 'w' - bool owned_file, is_uncompressed; + int16_t owned_file, compress_level; #ifdef _USE_KNETFILE union { knetFile *fpr; diff --git a/misc/export2sam.pl b/misc/export2sam.pl index a2a436c..ec6dacf 100755 --- a/misc/export2sam.pl +++ b/misc/export2sam.pl @@ -1,461 +1,545 @@ -#!/usr/bin/env perl -# -# -# Script to convert GERALD export files to SAM format. -# -# -# -########## License: -# -# The MIT License -# -# Original SAMtools version 0.1.2 copyright (c) 2008-2009 Genome Research Ltd. -# Modifications from version 0.1.2 to 2.0.0 copyright (c) 2010 Illumina, Inc. -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -# THE SOFTWARE. -# -# -# -########## ChangeLog: -# -# Version: 2.0.0 (15FEB2010) -# Script updated by Illumina in conjunction with CASAVA 1.7.0 release. -# Major changes are as follows: -# - The CIGAR string has been updated to include all gaps from ELANDv2 alignments. -# - The ELAND single read alignment score is always stored in the optional "SM" field -# and the ELAND paired read alignment score is stored in the optional "AS" field -# when it exists. -# - The MAPQ value is set to the higher of the two alignment scores, but no greater -# than 254, i.e. min(254,max(SM,AS)) -# - The SAM "proper pair" bit (0x0002) is now set for read pairs meeting ELAND's -# expected orientation and insert size criteria. -# - The default quality score translation is set for export files which contain -# Phread+64 quality values. An option, "--qlogodds", has been added to -# translate quality values from the Solexa+64 format used in export files prior -# to Pipeline 1.3 -# - The export match descriptor is now reverse-complemented when necessary such that -# it always corresponds to the forward strand of the reference, to be consistent -# with other information in the SAM record. It is now written to the optional -# 'XD' field (rather than 'MD') to acknowledge its minor differences from the -# samtools match descriptor (see additional detail below). -# - An option, "--nofilter", has been added to include reads which have failed -# primary analysis quality filtration. Such reads will have the corresponding -# SAM flag bit (0x0200) set. -# - Labels in the export 'contig' field are preserved by setting RNAME to -# "$export_chromosome/$export_contig" when then contig label exists. -# -# -# Contact: lh3 -# Version: 0.1.2 (03JAN2009) -# -# -# -########## Known Conversion Limitations: -# -# - Export records for reads that map to a position < 1 (allowed in export format), are converted -# to unmapped reads in the SAM record. -# - Export records contain the reserved chromosome names: "NM" and "QC". "NM" indicates that the -# aligner could not map the read to the reference sequence set, and "QC" means that the -# aligner did not attempt to map the read due to some technical limitation. Both of these -# alignment types are collapsed to the single unmapped alignment state in the SAM record. -# - The export match descriptor is slightly different than the samtools match descriptor. For -# this reason it is stored in the optional SAM field 'XD' (and not 'MD'). Note that the -# export match descriptor differs from the samtools version in two respects: (1) indels -# are explicitly closed with the '$' character and (2) insertions must be enumerated in -# the match descriptor. For example a 35-base read with a two-base insertion is described -# as: 20^2$14 -# -# -# - -my $version = "2.0.0"; - -use strict; -use warnings; - -use File::Spec qw(splitpath); -use Getopt::Long; -use List::Util qw(min max); - - -use constant { - EXPORT_INDEX => 6, - EXPORT_READNO => 7, - EXPORT_READ => 8, - EXPORT_QUAL => 9, - EXPORT_CHROM => 10, - EXPORT_CONTIG => 11, - EXPORT_POS => 12, - EXPORT_STRAND => 13, - EXPORT_MD => 14, - EXPORT_SEMAP => 15, - EXPORT_PEMAP => 16, - EXPORT_PASSFILT => 21, -}; - - -use constant { - SAM_QNAME => 0, - SAM_FLAG => 1, - SAM_RNAME => 2, - SAM_POS => 3, - SAM_MAPQ => 4, - SAM_CIGAR => 5, - SAM_MRNM => 6, - SAM_MPOS => 7, - SAM_ISIZE => 8, - SAM_SEQ => 9, - SAM_QUAL => 10, -}; - - -# function prototypes for Richard's code -sub match_desc_to_cigar($); -sub match_desc_frag_length($); -sub reverse_compl_match_descriptor($); -sub write_header($;$;$); - - -&export2sam; -exit; - - - - -sub export2sam { - - my $cmdline = $0 . " " . join(" ",@ARGV); - my $arg_count = scalar @ARGV; - my @spval = File::Spec->splitpath($0); - my $progname = $spval[2]; - - my $is_logodds_qvals = 0; # if true, assume files contain logodds (i.e. "solexa") quality values - my $is_nofilter = 0; - my $read1file; - my $read2file; - my $print_version = 0; - my $help = 0; - - my $result = GetOptions( "qlogodds" => \$is_logodds_qvals, - "nofilter" => \$is_nofilter, - "read1=s" => \$read1file, - "read2=s" => \$read2file, - "version" => \$print_version, - "help" => \$help ); - - my $usage = <) { - $export_line_count++; - my (@s1, @s2); - &export2sam_aux($_, $export_line_count, \@s1, \@conv_table, $is_paired, 1, $is_nofilter); - if ($is_paired) { - my $read2line = <$fh2>; - if(not $read2line){ - die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n Extra reads observed in read1 file at line no: $export_line_count.\n\n"); - } - &export2sam_aux($read2line, $export_line_count, \@s2, \@conv_table, $is_paired, 2, $is_nofilter); - - if (@s1 && @s2) { # then set mate coordinate - if($s1[SAM_QNAME] ne $s2[SAM_QNAME]){ - die("\nERROR: Non-paired reads in export files on line: $export_line_count.\n Read1: $_ Read2: $read2line\n"); - } - - my $isize = 0; - if ($s1[SAM_RNAME] ne '*' && $s1[SAM_RNAME] eq $s2[SAM_RNAME]) { # then calculate $isize - my $x1 = ($s1[SAM_FLAG] & 0x10)? $s1[SAM_POS] + length($s1[SAM_SEQ]) : $s1[SAM_POS]; - my $x2 = ($s2[SAM_FLAG] & 0x10)? $s2[SAM_POS] + length($s2[SAM_SEQ]) : $s2[SAM_POS]; - $isize = $x2 - $x1; - } - - foreach ([\@s1,\@s2,$isize],[\@s2,\@s1,-$isize]){ - my ($sa,$sb,$is) = @{$_}; - if ($sb->[SAM_RNAME] ne '*') { - $sa->[SAM_MRNM] = ($sb->[SAM_RNAME] eq $sa->[SAM_RNAME]) ? "=" : $sb->[SAM_RNAME]; - $sa->[SAM_MPOS] = $sb->[SAM_POS]; - $sa->[SAM_ISIZE] = $is; - $sa->[SAM_FLAG] |= 0x20 if ($sb->[SAM_FLAG] & 0x10); - } else { - $sa->[SAM_FLAG] |= 0x8; - } - } - } - } - print join("\t", @s1), "\n" if (@s1); - print join("\t", @s2), "\n" if (@s2 && $is_paired); - } - close($fh1); - if($is_paired) { - while(my $read2line = <$fh2>){ - $export_line_count++; - die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n Extra reads observed in read2 file at line no: $export_line_count.\n\n"); - } - close($fh2); - } -} - -sub export2sam_aux { - my ($line, $line_no, $s, $ct, $is_paired, $read_no, $is_nofilter) = @_; - chomp($line); - my @t = split("\t", $line); - @$s = (); - my $isPassFilt = ($t[EXPORT_PASSFILT] eq 'Y'); - return if(not ($isPassFilt or $is_nofilter)); - # read name - $s->[SAM_QNAME] = $t[1]? "$t[0]_$t[1]:$t[2]:$t[3]:$t[4]:$t[5]" : "$t[0]:$t[2]:$t[3]:$t[4]:$t[5]"; - # initial flag (will be updated later) - $s->[SAM_FLAG] = 0; - if($is_paired) { - if($t[EXPORT_READNO] != $read_no){ - die("\nERROR: read$read_no export file contains record with read number: " .$t[EXPORT_READNO] . " on line: $line_no\n\n"); - } - $s->[SAM_FLAG] |= 1 | 1<<(5 + $read_no); - } - $s->[SAM_FLAG] |= 0x200 if (not $isPassFilt); - - # read & quality - my $is_export_rev = ($t[EXPORT_STRAND] eq 'R'); - if ($is_export_rev) { # then reverse the sequence and quality - $s->[SAM_SEQ] = reverse($t[EXPORT_READ]); - $s->[SAM_SEQ] =~ tr/ACGTacgt/TGCAtgca/; - $s->[SAM_QUAL] = reverse($t[EXPORT_QUAL]); - } else { - $s->[SAM_SEQ] = $t[EXPORT_READ]; - $s->[SAM_QUAL] = $t[EXPORT_QUAL]; - } - my @convqual = (); - foreach (unpack('C*', $s->[SAM_QUAL])){ - my $val=$ct->[$_]; - if(not defined $val){ - my $msg="\nERROR: can't interpret export quality value: " . $_ . " in read$read_no export file, line: $line_no\n"; - if( $_ < 64 ) { $msg .= " Use --qlogodds flag to translate logodds (solexa) quality values.\n"; } - die($msg . "\n"); - } - push @convqual,$val; - } - - $s->[SAM_QUAL] = pack('C*',@convqual); # change coding - - - # coor - my $has_coor = 0; - $s->[SAM_RNAME] = "*"; - if ($t[EXPORT_CHROM] eq 'NM' or $t[EXPORT_CHROM] eq 'QC') { - $s->[SAM_FLAG] |= 0x4; # unmapped - } elsif ($t[EXPORT_CHROM] =~ /(\d+):(\d+):(\d+)/) { - $s->[SAM_FLAG] |= 0x4; # TODO: should I set BAM_FUNMAP in this case? - push(@$s, "H0:i:$1", "H1:i:$2", "H2:i:$3") - } elsif ($t[EXPORT_POS] < 1) { - $s->[SAM_FLAG] |= 0x4; # unmapped - } else { - $s->[SAM_RNAME] = $t[EXPORT_CHROM]; - $s->[SAM_RNAME] .= "/" . $t[EXPORT_CONTIG] if($t[EXPORT_CONTIG] ne ''); - $has_coor = 1; - } - $s->[SAM_POS] = $has_coor? $t[EXPORT_POS] : 0; - -# print STDERR "t[14] = " . $t[14] . "\n"; - my $matchDesc = ''; - $s->[SAM_CIGAR] = "*"; - if($has_coor){ - $matchDesc = ($is_export_rev) ? reverse_compl_match_descriptor($t[EXPORT_MD]) : $t[EXPORT_MD]; - - if($matchDesc =~ /\^/){ - # construct CIGAR string using Richard's function - $s->[SAM_CIGAR] = match_desc_to_cigar($matchDesc); # indel processing - } else { - $s->[SAM_CIGAR] = length($s->[SAM_SEQ]) . "M"; - } - } - -# print STDERR "cigar_string = $cigar_string\n"; - - $s->[SAM_FLAG] |= 0x10 if ($has_coor && $is_export_rev); - if($has_coor){ - my $semap = ($t[EXPORT_SEMAP] ne '') ? $t[EXPORT_SEMAP] : 0; - my $pemap = 0; - if($is_paired) { - $pemap = ($t[EXPORT_PEMAP] ne '') ? $t[EXPORT_PEMAP] : 0; - - # set `proper pair' bit if non-blank, non-zero PE alignment score: - $s->[SAM_FLAG] |= 0x02 if ($pemap > 0); - } - $s->[SAM_MAPQ] = min(254,max($semap,$pemap)); - } else { - $s->[SAM_MAPQ] = 0; - } - # mate coordinate - $s->[SAM_MRNM] = '*'; - $s->[SAM_MPOS] = 0; - $s->[SAM_ISIZE] = 0; - # aux - push(@$s, "BC:Z:$t[EXPORT_INDEX]") if ($t[EXPORT_INDEX]); - if($has_coor){ - # The export match descriptor differs slightly from the samtools match descriptor. - # In order for the converted SAM files to be as compliant as possible, - # we put the export match descriptor in optional field 'XD' rather than 'MD': - push(@$s, "XD:Z:$matchDesc"); - push(@$s, "SM:i:$t[EXPORT_SEMAP]") if ($t[EXPORT_SEMAP] ne ''); - push(@$s, "AS:i:$t[EXPORT_PEMAP]") if ($is_paired and ($t[EXPORT_PEMAP] ne '')); - } -} - - - -# -# the following code is taken from Richard Shaw's sorted2sam.pl file -# -sub reverse_compl_match_descriptor($) -{ -# print "\nREVERSING THE MATCH DESCRIPTOR!\n"; - my ($match_desc) = @_; - my $rev_compl_match_desc = reverse($match_desc); - $rev_compl_match_desc =~ tr/ACGT\^\$/TGCA\$\^/; - - # Unreverse the digits of numbers. - $rev_compl_match_desc = join('', - map {($_ =~ /\d+/) - ? join('', reverse(split('', $_))) - : $_} split(/(\d+)/, - $rev_compl_match_desc)); - - return $rev_compl_match_desc; -} - - - -sub match_desc_to_cigar($) -{ - my ($match_desc) = @_; - - my @match_desc_parts = split(/(\^.*?\$)/, $match_desc); - my $cigar_str = ''; - my $cigar_del_ch = 'D'; - my $cigar_ins_ch = 'I'; - my $cigar_match_ch = 'M'; - - foreach my $match_desc_part (@match_desc_parts) { - next if (!$match_desc_part); - - if ($match_desc_part =~ /^\^([ACGTN]+)\$$/) { - # Deletion - $cigar_str .= (length($1) . $cigar_del_ch); - } elsif ($match_desc_part =~ /^\^(\d+)\$$/) { - # Insertion - $cigar_str .= ($1 . $cigar_ins_ch); - } else { - $cigar_str .= (match_desc_frag_length($match_desc_part) - . $cigar_match_ch); - } - } - - return $cigar_str; -} - - -#------------------------------------------------------------------------------ - -sub match_desc_frag_length($) - { - my ($match_desc_str) = @_; - my $len = 0; - - my @match_desc_fields = split(/([ACGTN]+)/, $match_desc_str); - - foreach my $match_desc_field (@match_desc_fields) { - next if ($match_desc_field eq ''); - - $len += (($match_desc_field =~ /(\d+)/) - ? $1 : length($match_desc_field)); - } - - return $len; -} - - -# argument holds the command line -sub write_header($;$;$) -{ - my ($progname,$version,$cl) = @_; - my $complete_header = ""; - $complete_header .= "\@PG\tID:$progname\tVN:$version\tCL:$cl\n"; - - return $complete_header; -} +#!/usr/bin/env perl +# +# +# export2sam.pl converts GERALD export files to SAM format. +# +# +# +########## License: +# +# The MIT License +# +# Original SAMtools work copyright (c) 2008-2009 Genome Research Ltd. +# Modified SAMtools work copyright (c) 2010 Illumina, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +# +# +# +# +########## ChangeLog: +# +# Version: 2.3.1 (18MAR2011) +# +# - Restore file '-' as stdin input. +# +# Version: 2.3.0 (24JAN2011) +# +# - Add support for export reserved chromosome name "CONTROL", +# which is translated to optional field "XC:Z:CONTROL". +# - Check for ".gz" file extension on export files and open +# these as gzip pipes when the extension is found. +# +# Version: 2.2.0 (16NOV2010) +# +# - Remove any leading zeros in export fields: RUNNO,LANE,TILE,X,Y +# - For export records with reserved chromosome name identifiers +# "QC" and "RM", add the optional field "XC:Z:QC" or "XC:Z:RM" +# to the SAM record, so that these cases can be distinguished +# from other unmatched reads. +# +# Version: 2.1.0 (21SEP2010) +# +# - Additional export record error checking. +# - Convert export records with chromomsome value of "RM" to unmapped +# SAM records. +# +# Version: 2.0.0 (15FEB2010) +# +# Script updated by Illumina in conjunction with CASAVA 1.7.0 +# release. +# +# Major changes are as follows: +# - The CIGAR string has been updated to include all gaps from +# ELANDv2 alignments. +# - The ELAND single read alignment score is always stored in the +# optional "SM" field and the ELAND paired read alignment score +# is stored in the optional "AS" field when it exists. +# - The MAPQ value is set to the higher of the two alignment scores, +# but no greater than 254, i.e. min(254,max(SM,AS)) +# - The SAM "proper pair" bit (0x0002) is now set for read pairs +# meeting ELAND's expected orientation and insert size criteria. +# - The default quality score translation is set for export files +# which contain Phread+64 quality values. An option, +# "--qlogodds", has been added to translate quality values from +# the Solexa+64 format used in export files prior to Pipeline +# 1.3 +# - The export match descriptor is now reverse-complemented when +# necessary such that it always corresponds to the forward +# strand of the reference, to be consistent with other +# information in the SAM record. It is now written to the +# optional 'XD' field (rather than 'MD') to acknowledge its +# minor differences from the samtools match descriptor (see +# additional detail below). +# - An option, "--nofilter", has been added to include reads which +# have failed primary analysis quality filtration. Such reads +# will have the corresponding SAM flag bit (0x0200) set. +# - Labels in the export 'contig' field are preserved by setting +# RNAME to "$export_chromosome/$export_contig" when the contig +# label exists. +# +# +# Contact: lh3 +# Version: 0.1.2 (03JAN2009) +# +# +# +########## Known Conversion Limitations: +# +# - Export records for reads that map to a position < 1 (allowed +# in export format), are converted to unmapped reads in the SAM +# record. +# - Export records contain the reserved chromosome names: "NM", +# "QC","RM" and "CONTROL". "NM" indicates that the aligner could +# not map the read to the reference sequence set. "QC" means that +# the aligner did not attempt to map the read due to some +# technical limitation. "RM" means that the read mapped to a set +# of 'contaminant' sequences specified in GERALD's RNA-seq +# workflow. "CONTROL" means that the read is a control. All of +# these alignment types are collapsed to the single unmapped +# alignment state in the SAM record, but the optional SAM "XC" +# field is used to record the original reserved chromosome name of +# the read for all but the "NM" case. +# - The export match descriptor is slightly different than the +# samtools match descriptor. For this reason it is stored in the +# optional SAM field 'XD' (and not 'MD'). Note that the export +# match descriptor differs from the samtools version in two +# respects: (1) indels are explicitly closed with the '$' +# character and (2) insertions must be enumerated in the match +# descriptor. For example a 35-base read with a two-base insertion +# is described as: 20^2$14 +# +# +# + +my $version = "2.3.1"; + +use strict; +use warnings; + +use Getopt::Long; +use File::Spec; +use List::Util qw(min max); + + +use constant { + EXPORT_MACHINE => 0, + EXPORT_RUNNO => 1, + EXPORT_LANE => 2, + EXPORT_TILE => 3, + EXPORT_X => 4, + EXPORT_Y => 5, + EXPORT_INDEX => 6, + EXPORT_READNO => 7, + EXPORT_READ => 8, + EXPORT_QUAL => 9, + EXPORT_CHROM => 10, + EXPORT_CONTIG => 11, + EXPORT_POS => 12, + EXPORT_STRAND => 13, + EXPORT_MD => 14, + EXPORT_SEMAP => 15, + EXPORT_PEMAP => 16, + EXPORT_PASSFILT => 21, + EXPORT_SIZE => 22, +}; + + +use constant { + SAM_QNAME => 0, + SAM_FLAG => 1, + SAM_RNAME => 2, + SAM_POS => 3, + SAM_MAPQ => 4, + SAM_CIGAR => 5, + SAM_MRNM => 6, + SAM_MPOS => 7, + SAM_ISIZE => 8, + SAM_SEQ => 9, + SAM_QUAL => 10, +}; + + +# function prototypes for Richard's code +sub match_desc_to_cigar($); +sub match_desc_frag_length($); +sub reverse_compl_match_descriptor($); +sub write_header($;$;$); + + +&export2sam; +exit; + + + + +sub export2sam { + + my $cmdline = $0 . " " . join(" ",@ARGV); + my $arg_count = scalar @ARGV; + my $progname = (File::Spec->splitpath($0))[2]; + + my $is_logodds_qvals = 0; # if true, assume files contain logodds (i.e. "solexa") quality values + my $is_nofilter = 0; + my $read1file; + my $read2file; + my $print_version = 0; + my $help = 0; + + my $result = GetOptions( "qlogodds" => \$is_logodds_qvals, + "nofilter" => \$is_nofilter, + "read1=s" => \$read1file, + "read2=s" => \$read2file, + "version" => \$print_version, + "help" => \$help ); + + my $usage = <) { + $export_line_count++; + my (@s1, @s2); + &export2sam_aux($_, $export_line_count, \@s1, \@conv_table, $is_paired, 1, $is_nofilter); + if ($is_paired) { + my $read2line = <$fh2>; + if(not $read2line){ + die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n Extra reads observed in read1 file at line no: $export_line_count.\n\n"); + } + &export2sam_aux($read2line, $export_line_count, \@s2, \@conv_table, $is_paired, 2, $is_nofilter); + + if (@s1 && @s2) { # then set mate coordinate + if($s1[SAM_QNAME] ne $s2[SAM_QNAME]){ + die("\nERROR: Non-paired reads in export files on line: $export_line_count.\n Read1: $_ Read2: $read2line\n"); + } + + my $isize = 0; + if ($s1[SAM_RNAME] ne '*' && $s1[SAM_RNAME] eq $s2[SAM_RNAME]) { # then calculate $isize + my $x1 = ($s1[SAM_FLAG] & 0x10)? $s1[SAM_POS] + length($s1[SAM_SEQ]) : $s1[SAM_POS]; + my $x2 = ($s2[SAM_FLAG] & 0x10)? $s2[SAM_POS] + length($s2[SAM_SEQ]) : $s2[SAM_POS]; + $isize = $x2 - $x1; + } + + foreach ([\@s1,\@s2,$isize],[\@s2,\@s1,-$isize]){ + my ($sa,$sb,$is) = @{$_}; + if ($sb->[SAM_RNAME] ne '*') { + $sa->[SAM_MRNM] = ($sb->[SAM_RNAME] eq $sa->[SAM_RNAME]) ? "=" : $sb->[SAM_RNAME]; + $sa->[SAM_MPOS] = $sb->[SAM_POS]; + $sa->[SAM_ISIZE] = $is; + $sa->[SAM_FLAG] |= 0x20 if ($sb->[SAM_FLAG] & 0x10); + } else { + $sa->[SAM_FLAG] |= 0x8; + } + } + } + } + print join("\t", @s1), "\n" if (@s1); + print join("\t", @s2), "\n" if (@s2 && $is_paired); + } + close($fh1); + if($is_paired) { + while(my $read2line = <$fh2>){ + $export_line_count++; + die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n Extra reads observed in read2 file at line no: $export_line_count.\n\n"); + } + close($fh2); + } +} + +sub export2sam_aux { + my ($line, $line_no, $s, $ct, $is_paired, $read_no, $is_nofilter) = @_; + chomp($line); + my @t = split("\t", $line); + if(scalar(@t) < EXPORT_SIZE) { + my $msg="\nERROR: Unexpected number of fields in export record on line $line_no of read$read_no export file. Found " . scalar(@t) . " fields but expected " . EXPORT_SIZE . ".\n"; + $msg.="\t...erroneous export record:\n" . $line . "\n\n"; + die($msg); + } + @$s = (); + my $isPassFilt = ($t[EXPORT_PASSFILT] eq 'Y'); + return if(not ($isPassFilt or $is_nofilter)); + # read name + my $samQnamePrefix = $t[EXPORT_MACHINE] . (($t[EXPORT_RUNNO] ne "") ? "_" . int($t[EXPORT_RUNNO]) : ""); + $s->[SAM_QNAME] = join(':', $samQnamePrefix, int($t[EXPORT_LANE]), int($t[EXPORT_TILE]), + int($t[EXPORT_X]), int($t[EXPORT_Y])); + # initial flag (will be updated later) + $s->[SAM_FLAG] = 0; + if($is_paired) { + if($t[EXPORT_READNO] != $read_no){ + die("\nERROR: read$read_no export file contains record with read number: " .$t[EXPORT_READNO] . " on line: $line_no\n\n"); + } + $s->[SAM_FLAG] |= 1 | 1<<(5 + $read_no); + } + $s->[SAM_FLAG] |= 0x200 if (not $isPassFilt); + + # read & quality + my $is_export_rev = ($t[EXPORT_STRAND] eq 'R'); + if ($is_export_rev) { # then reverse the sequence and quality + $s->[SAM_SEQ] = reverse($t[EXPORT_READ]); + $s->[SAM_SEQ] =~ tr/ACGTacgt/TGCAtgca/; + $s->[SAM_QUAL] = reverse($t[EXPORT_QUAL]); + } else { + $s->[SAM_SEQ] = $t[EXPORT_READ]; + $s->[SAM_QUAL] = $t[EXPORT_QUAL]; + } + my @convqual = (); + foreach (unpack('C*', $s->[SAM_QUAL])){ + my $val=$ct->[$_]; + if(not defined $val){ + my $msg="\nERROR: can't interpret export quality value: " . $_ . " in read$read_no export file, line: $line_no\n"; + if( $_ < 64 ) { $msg .= " Use --qlogodds flag to translate logodds (solexa) quality values.\n"; } + die($msg . "\n"); + } + push @convqual,$val; + } + + $s->[SAM_QUAL] = pack('C*',@convqual); # change coding + + + # coor + my $has_coor = 0; + $s->[SAM_RNAME] = "*"; + if (($t[EXPORT_CHROM] eq 'NM') or + ($t[EXPORT_CHROM] eq 'QC') or + ($t[EXPORT_CHROM] eq 'RM') or + ($t[EXPORT_CHROM] eq 'CONTROL')) { + $s->[SAM_FLAG] |= 0x4; # unmapped + push(@$s,"XC:Z:".$t[EXPORT_CHROM]) if($t[EXPORT_CHROM] ne 'NM'); + } elsif ($t[EXPORT_CHROM] =~ /(\d+):(\d+):(\d+)/) { + $s->[SAM_FLAG] |= 0x4; # TODO: should I set BAM_FUNMAP in this case? + push(@$s, "H0:i:$1", "H1:i:$2", "H2:i:$3") + } elsif ($t[EXPORT_POS] < 1) { + $s->[SAM_FLAG] |= 0x4; # unmapped + } else { + $s->[SAM_RNAME] = $t[EXPORT_CHROM]; + $s->[SAM_RNAME] .= "/" . $t[EXPORT_CONTIG] if($t[EXPORT_CONTIG] ne ''); + $has_coor = 1; + } + $s->[SAM_POS] = $has_coor? $t[EXPORT_POS] : 0; + +# print STDERR "t[14] = " . $t[14] . "\n"; + my $matchDesc = ''; + $s->[SAM_CIGAR] = "*"; + if($has_coor){ + $matchDesc = ($is_export_rev) ? reverse_compl_match_descriptor($t[EXPORT_MD]) : $t[EXPORT_MD]; + + if($matchDesc =~ /\^/){ + # construct CIGAR string using Richard's function + $s->[SAM_CIGAR] = match_desc_to_cigar($matchDesc); # indel processing + } else { + $s->[SAM_CIGAR] = length($s->[SAM_SEQ]) . "M"; + } + } + +# print STDERR "cigar_string = $cigar_string\n"; + + $s->[SAM_FLAG] |= 0x10 if ($has_coor && $is_export_rev); + if($has_coor){ + my $semap = ($t[EXPORT_SEMAP] ne '') ? $t[EXPORT_SEMAP] : 0; + my $pemap = 0; + if($is_paired) { + $pemap = ($t[EXPORT_PEMAP] ne '') ? $t[EXPORT_PEMAP] : 0; + + # set `proper pair' bit if non-blank, non-zero PE alignment score: + $s->[SAM_FLAG] |= 0x02 if ($pemap > 0); + } + $s->[SAM_MAPQ] = min(254,max($semap,$pemap)); + } else { + $s->[SAM_MAPQ] = 0; + } + # mate coordinate + $s->[SAM_MRNM] = '*'; + $s->[SAM_MPOS] = 0; + $s->[SAM_ISIZE] = 0; + # aux + push(@$s, "BC:Z:$t[EXPORT_INDEX]") if ($t[EXPORT_INDEX]); + if($has_coor){ + # The export match descriptor differs slightly from the samtools match descriptor. + # In order for the converted SAM files to be as compliant as possible, + # we put the export match descriptor in optional field 'XD' rather than 'MD': + push(@$s, "XD:Z:$matchDesc"); + push(@$s, "SM:i:$t[EXPORT_SEMAP]") if ($t[EXPORT_SEMAP] ne ''); + push(@$s, "AS:i:$t[EXPORT_PEMAP]") if ($is_paired and ($t[EXPORT_PEMAP] ne '')); + } +} + + + +# +# the following code is taken from Richard Shaw's sorted2sam.pl file +# +sub reverse_compl_match_descriptor($) +{ +# print "\nREVERSING THE MATCH DESCRIPTOR!\n"; + my ($match_desc) = @_; + my $rev_compl_match_desc = reverse($match_desc); + $rev_compl_match_desc =~ tr/ACGT\^\$/TGCA\$\^/; + + # Unreverse the digits of numbers. + $rev_compl_match_desc = join('', + map {($_ =~ /\d+/) + ? join('', reverse(split('', $_))) + : $_} split(/(\d+)/, + $rev_compl_match_desc)); + + return $rev_compl_match_desc; +} + + + +sub match_desc_to_cigar($) +{ + my ($match_desc) = @_; + + my @match_desc_parts = split(/(\^.*?\$)/, $match_desc); + my $cigar_str = ''; + my $cigar_del_ch = 'D'; + my $cigar_ins_ch = 'I'; + my $cigar_match_ch = 'M'; + + foreach my $match_desc_part (@match_desc_parts) { + next if (!$match_desc_part); + + if ($match_desc_part =~ /^\^([ACGTN]+)\$$/) { + # Deletion + $cigar_str .= (length($1) . $cigar_del_ch); + } elsif ($match_desc_part =~ /^\^(\d+)\$$/) { + # Insertion + $cigar_str .= ($1 . $cigar_ins_ch); + } else { + $cigar_str .= (match_desc_frag_length($match_desc_part) + . $cigar_match_ch); + } + } + + return $cigar_str; +} + + +#------------------------------------------------------------------------------ + +sub match_desc_frag_length($) + { + my ($match_desc_str) = @_; + my $len = 0; + + my @match_desc_fields = split(/([ACGTN]+)/, $match_desc_str); + + foreach my $match_desc_field (@match_desc_fields) { + next if ($match_desc_field eq ''); + + $len += (($match_desc_field =~ /(\d+)/) + ? $1 : length($match_desc_field)); + } + + return $len; +} + + +# argument holds the command line +sub write_header($;$;$) +{ + my ($progname,$version,$cl) = @_; + my $complete_header = ""; + $complete_header .= "\@PG\tID:$progname\tVN:$version\tCL:$cl\n"; + + return $complete_header; +} diff --git a/sam.c b/sam.c index ecdee02..e195b0f 100644 --- a/sam.c +++ b/sam.c @@ -40,9 +40,9 @@ samfile_t *samopen(const char *fn, const char *mode, const void *aux) { samfile_t *fp; fp = (samfile_t*)calloc(1, sizeof(samfile_t)); - if (mode[0] == 'r') { // read + if (strchr(mode, 'r')) { // read fp->type |= TYPE_READ; - if (mode[1] == 'b') { // binary + if (strchr(mode, 'b')) { // binary fp->type |= TYPE_BAM; fp->x.bam = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r"); if (fp->x.bam == 0) goto open_err_ret; @@ -63,11 +63,15 @@ samfile_t *samopen(const char *fn, const char *mode, const void *aux) fprintf(stderr, "[samopen] no @SQ lines in the header.\n"); } else fprintf(stderr, "[samopen] SAM header is present: %d sequences.\n", fp->header->n_targets); } - } else if (mode[0] == 'w') { // write + } else if (strchr(mode, 'w')) { // write fp->header = bam_header_dup((const bam_header_t*)aux); - if (mode[1] == 'b') { // binary + if (strchr(mode, 'b')) { // binary char bmode[3]; - bmode[0] = 'w'; bmode[1] = strstr(mode, "u")? 'u' : 0; bmode[2] = 0; + int i, compress_level = -1; + for (i = 0; mode[i]; ++i) if (mode[i] >= '0' && mode[i] <= '9') break; + if (mode[i]) compress_level = mode[i] - '0'; + if (strchr(mode, 'u')) compress_level = 0; + bmode[0] = 'w'; bmode[1] = compress_level < 0? 0 : compress_level + '0'; bmode[2] = 0; fp->type |= TYPE_BAM; fp->x.bam = strcmp(fn, "-")? bam_open(fn, bmode) : bam_dopen(fileno(stdout), bmode); if (fp->x.bam == 0) goto open_err_ret; @@ -76,11 +80,11 @@ samfile_t *samopen(const char *fn, const char *mode, const void *aux) // open file fp->x.tamw = strcmp(fn, "-")? fopen(fn, "w") : stdout; if (fp->x.tamr == 0) goto open_err_ret; - if (strstr(mode, "X")) fp->type |= BAM_OFSTR<<2; - else if (strstr(mode, "x")) fp->type |= BAM_OFHEX<<2; + if (strchr(mode, 'X')) fp->type |= BAM_OFSTR<<2; + else if (strchr(mode, 'x')) fp->type |= BAM_OFHEX<<2; else fp->type |= BAM_OFDEC<<2; // write header - if (strstr(mode, "h")) { + if (strchr(mode, 'h')) { int i; bam_header_t *alt; // parse the header text diff --git a/sam_view.c b/sam_view.c index eb69449..170bd84 100644 --- a/sam_view.c +++ b/sam_view.c @@ -82,7 +82,7 @@ static int usage(int is_long_help); int main_samview(int argc, char *argv[]) { - int c, is_header = 0, is_header_only = 0, is_bamin = 1, ret = 0, is_uncompressed = 0, is_bamout = 0, slx2sngr = 0, is_count = 0; + int c, is_header = 0, is_header_only = 0, is_bamin = 1, ret = 0, compress_level = -1, is_bamout = 0, slx2sngr = 0, is_count = 0; int of_type = BAM_OFDEC, is_long_help = 0; int count = 0; samfile_t *in = 0, *out = 0; @@ -90,7 +90,7 @@ int main_samview(int argc, char *argv[]) /* parse command-line options */ strcpy(in_mode, "r"); strcpy(out_mode, "w"); - while ((c = getopt(argc, argv, "Sbct:hHo:q:f:F:ul:r:xX?T:CR:")) >= 0) { + while ((c = getopt(argc, argv, "Sbct:h1Ho:q:f:F:ul:r:xX?T:CR:")) >= 0) { switch (c) { case 'c': is_count = 1; break; case 'C': slx2sngr = 1; break; @@ -103,7 +103,8 @@ int main_samview(int argc, char *argv[]) case 'f': g_flag_on = strtol(optarg, 0, 0); break; case 'F': g_flag_off = strtol(optarg, 0, 0); break; case 'q': g_min_mapQ = atoi(optarg); break; - case 'u': is_uncompressed = 1; break; + case 'u': compress_level = 0; break; + case '1': compress_level = 1; break; case 'l': g_library = strdup(optarg); break; case 'r': g_rg = strdup(optarg); break; case 'R': fn_rg = strdup(optarg); break; @@ -114,7 +115,7 @@ int main_samview(int argc, char *argv[]) default: return usage(is_long_help); } } - if (is_uncompressed) is_bamout = 1; + if (compress_level >= 0) is_bamout = 1; if (is_header_only) is_header = 1; if (is_bamout) strcat(out_mode, "b"); else { @@ -123,7 +124,11 @@ int main_samview(int argc, char *argv[]) } if (is_bamin) strcat(in_mode, "b"); if (is_header) strcat(out_mode, "h"); - if (is_uncompressed) strcat(out_mode, "u"); + if (compress_level >= 0) { + char tmp[2]; + tmp[0] = compress_level + '0'; tmp[1] = '\0'; + strcat(out_mode, tmp); + } if (argc == optind) return usage(is_long_help); // potential memory leak... // read the list of read groups @@ -231,6 +236,7 @@ static int usage(int is_long_help) fprintf(stderr, " -H print header only (no alignments)\n"); fprintf(stderr, " -S input is SAM\n"); fprintf(stderr, " -u uncompressed BAM output (force -b)\n"); + fprintf(stderr, " -1 fast compression (force -b)\n"); fprintf(stderr, " -x output FLAG in HEX (samtools-C specific)\n"); fprintf(stderr, " -X output FLAG in string (samtools-C specific)\n"); fprintf(stderr, " -c print only the count of matching records\n"); diff --git a/samtools.1 b/samtools.1 index e0c7743..ba32392 100644 --- a/samtools.1 +++ b/samtools.1 @@ -1,4 +1,4 @@ -.TH samtools 1 "1 March 2011" "samtools-0.1.13" "Bioinformatics tools" +.TH samtools 1 "16 March 2011" "samtools-0.1.14" "Bioinformatics tools" .SH NAME .PP samtools - Utilities for the Sequence Alignment/Map (SAM) format @@ -145,7 +145,10 @@ identifiers are absent, each input file is regarded as one sample. .B OPTIONS: .RS -.TP 8 +.TP 10 +.B -A +Do not skip anomalous read pairs in variant calling. +.TP .B -B Disable probabilistic realignment for the computation of base alignment quality (BAQ). BAQ is the Phred-scaled probability of a read base being @@ -159,6 +162,14 @@ being generated from the mapped position, the new mapping quality is about sqrt((INT-q)/INT)*INT. A zero value disables this functionality; if enabled, the recommended value for BWA is 50. [0] .TP +.BI -d \ INT +At a position, read maximally +.I INT +reads per input BAM. [250] +.TP +.B -D +Output per-sample read depth +.TP .BI -e \ INT Phred-scaled gap extension sequencing error probability. Reducing .I INT @@ -184,9 +195,17 @@ is modeled as .IR INT * s / l . [100] .TP +.B -I +Do not perform INDEL calling +.TP .BI -l \ FILE File containing a list of sites where pileup or BCF is outputted [null] .TP +.BI -L \ INT +Skip INDEL calling if the average per-sample depth is above +.IR INT . +[250] +.TP .BI -o \ INT Phred-scaled gap open sequencing error probability. Reducing .I INT @@ -210,6 +229,9 @@ Only generate pileup in region .I STR [all sites] .TP +.B -S +Output per-sample Phred-scaled strand bias P-value +.TP .B -u Similar to .B -g @@ -227,6 +249,16 @@ with the header in This command is much faster than replacing the header with a BAM->SAM->BAM conversion. +.TP +.B cat +samtools cat [-h header.sam] [-o out.bam] [ ... ] + +Concatenate BAMs. The sequence dictionary of each input BAM must be identical, +although this command does not check this. This command uses a similar trick +to +.B reheader +which enables fast BAM concatenation. + .TP .B sort samtools sort [-no] [-m maxMem] -- 2.30.2