KNETFILE_O= knetfile.o
LOBJS= bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \
bam_pileup.o bam_lpileup.o bam_md.o glf.o razf.o faidx.o \
- $(KNETFILE_O) bam_sort.o sam_header.o bam_reheader.o kprobaln.o
+ $(KNETFILE_O) bam_sort.o sam_header.o bam_reheader.o kprobaln.o bam_cat.o
AOBJS= bam_tview.o bam_maqcns.o bam_plcmd.o sam_view.o \
bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \
bamtk.o kaln.o bam2bcf.o bam2bcf_indel.o errmod.o sample.o \
+Beta release 0.1.14 (16 March, 2011)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This release implements a method for testing associations for case-control
+data. The method does not call genotypes but instead sums over all genotype
+configurations to compute a chi^2 based test statistics. It can be potentially
+applied to comparing a pair of samples (e.g. a tumor-normal pair), but this
+has not been evaluated on real data.
+
+Another new feature is to make X chromosome variant calls when female and male
+samples are both present. The user needs to provide a file indicating the
+ploidy of each sample.
+
+Other new features:
+
+ * Added `samtools mpileup -L' to skip INDEL calling in regions with
+ excessively high coverage. Such regions dramatically slow down mpileup.
+
+ * Added `bcftools view -F' to parse BCF files generated by samtools r921 or
+ older which encode PL in a different way.
+
+(0.1.14: 16 March 2011, r930:163)
+
+
+
Beta release 0.1.13 (1 March, 2011)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
* Updated the BCF spec.
* Added the `FQ' VCF INFO field, which gives the phred-scaled probability
- of all samples being the samei (identical to the reference or all homozygous
+ of all samples being the same (identical to the reference or all homozygous
variants). Option `view -f' has been dropped.
* Implementated of "vcfutils.pl vcf2fq" to generate a consensus sequence
@copyright Genome Research Ltd.
*/
-#define BAM_VERSION "0.1.13 (r926:134)"
+#define BAM_VERSION "0.1.14 (r933:170)"
#include <stdint.h>
#include <stdlib.h>
--- /dev/null
+/*\r
+\r
+bam_cat -- efficiently concatenates bam files\r
+\r
+bam_cat can be used to concatenate BAM files. Under special\r
+circumstances, it can be used as an alternative to 'samtools merge' to\r
+concatenate multiple sorted files into a single sorted file. For this\r
+to work each file must be sorted, and the sorted files must be given\r
+as command line arguments in order such that the final read in file i\r
+is less than or equal to the first read in file i+1.\r
+\r
+This code is derived from the bam_reheader function in samtools 0.1.8\r
+and modified to perform concatenation by Chris Saunders on behalf of\r
+Illumina.\r
+\r
+\r
+########## License:\r
+\r
+The MIT License\r
+\r
+Original SAMtools work copyright (c) 2008-2009 Genome Research Ltd.\r
+Modified SAMtools work copyright (c) 2010 Illumina, Inc.\r
+\r
+Permission is hereby granted, free of charge, to any person obtaining a copy\r
+of this software and associated documentation files (the "Software"), to deal\r
+in the Software without restriction, including without limitation the rights\r
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\r
+copies of the Software, and to permit persons to whom the Software is\r
+furnished to do so, subject to the following conditions:\r
+\r
+The above copyright notice and this permission notice shall be included in\r
+all copies or substantial portions of the Software.\r
+\r
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\r
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\r
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\r
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\r
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\r
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\r
+THE SOFTWARE.\r
+\r
+*/\r
+\r
+\r
+/*\r
+makefile:\r
+"""\r
+CC=gcc\r
+CFLAGS+=-g -Wall -O2 -D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE -I$(SAMTOOLS_DIR)\r
+LDFLAGS+=-L$(SAMTOOLS_DIR)\r
+LDLIBS+=-lbam -lz\r
+\r
+all:bam_cat\r
+"""\r
+*/\r
+\r
+\r
+#include <stdio.h>\r
+#include <stdlib.h>\r
+#include <unistd.h>\r
+\r
+#include "bgzf.h"\r
+#include "bam.h"\r
+\r
+#define BUF_SIZE 0x10000\r
+\r
+#define GZIPID1 31\r
+#define GZIPID2 139\r
+\r
+#define BGZF_EMPTY_BLOCK_SIZE 28\r
+\r
+\r
+int bam_cat(int nfn, char * const *fn, const bam_header_t *h, const char* outbam)\r
+{\r
+ BGZF *fp;\r
+ FILE* fp_file;\r
+ uint8_t *buf;\r
+ uint8_t ebuf[BGZF_EMPTY_BLOCK_SIZE];\r
+ const int es=BGZF_EMPTY_BLOCK_SIZE;\r
+ int i;\r
+ \r
+ fp = strcmp(outbam, "-")? bgzf_open(outbam, "w") : bgzf_fdopen(fileno(stdout), "w");\r
+ if (fp == 0) {\r
+ fprintf(stderr, "[%s] ERROR: fail to open output file '%s'.\n", __func__, outbam);\r
+ return 1;\r
+ }\r
+ if (h) bam_header_write(fp, h);\r
+ \r
+ buf = (uint8_t*) malloc(BUF_SIZE);\r
+ for(i = 0; i < nfn; ++i){\r
+ BGZF *in;\r
+ bam_header_t *old;\r
+ int len,j;\r
+ \r
+ in = strcmp(fn[i], "-")? bam_open(fn[i], "r") : bam_dopen(fileno(stdin), "r");\r
+ if (in == 0) {\r
+ fprintf(stderr, "[%s] ERROR: fail to open file '%s'.\n", __func__, fn[i]);\r
+ return -1;\r
+ }\r
+ if (in->open_mode != 'r') return -1;\r
+ \r
+ old = bam_header_read(in);\r
+ if (h == 0 && i == 0) bam_header_write(fp, old);\r
+ \r
+ if (in->block_offset < in->block_length) {\r
+ bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset);\r
+ bgzf_flush(fp);\r
+ }\r
+ \r
+ j=0;\r
+#ifdef _USE_KNETFILE\r
+ fp_file=fp->x.fpw;\r
+ while ((len = knet_read(in->x.fpr, buf, BUF_SIZE)) > 0) {\r
+#else \r
+ fp_file=fp->file;\r
+ while (!feof(in->file) && (len = fread(buf, 1, BUF_SIZE, in->file)) > 0) {\r
+#endif\r
+ if(len<es){\r
+ int diff=es-len;\r
+ if(j==0) {\r
+ fprintf(stderr, "[%s] ERROR: truncated file?: '%s'.\n", __func__, fn[i]);\r
+ return -1;\r
+ }\r
+ fwrite(ebuf, 1, len, fp_file);\r
+ memcpy(ebuf,ebuf+len,diff);\r
+ memcpy(ebuf+diff,buf,len);\r
+ } else {\r
+ if(j!=0) fwrite(ebuf, 1, es, fp_file);\r
+ len-= es;\r
+ memcpy(ebuf,buf+len,es);\r
+ fwrite(buf, 1, len, fp_file);\r
+ }\r
+ j=1;\r
+ }\r
+\r
+ /* check final gzip block */\r
+ {\r
+ const uint8_t gzip1=ebuf[0];\r
+ const uint8_t gzip2=ebuf[1];\r
+ const uint32_t isize=*((uint32_t*)(ebuf+es-4));\r
+ if(((gzip1!=GZIPID1) || (gzip2!=GZIPID2)) || (isize!=0)) {\r
+ fprintf(stderr, "[%s] WARNING: Unexpected block structure in file '%s'.", __func__, fn[i]);\r
+ fprintf(stderr, " Possible output corruption.\n");\r
+ fwrite(ebuf, 1, es, fp_file);\r
+ }\r
+ }\r
+ bam_header_destroy(old);\r
+ bgzf_close(in);\r
+ }\r
+ free(buf);\r
+ bgzf_close(fp);\r
+ return 0;\r
+}\r
+\r
+\r
+\r
+int main_cat(int argc, char *argv[])\r
+{\r
+ bam_header_t *h = 0;\r
+ char *outfn = 0;\r
+ int c, ret;\r
+ while ((c = getopt(argc, argv, "h:o:")) >= 0) {\r
+ switch (c) {\r
+ case 'h': {\r
+ tamFile fph = sam_open(optarg);\r
+ if (fph == 0) {\r
+ fprintf(stderr, "[%s] ERROR: fail to read the header from '%s'.\n", __func__, argv[1]);\r
+ return 1;\r
+ }\r
+ h = sam_header_read(fph);\r
+ sam_close(fph);\r
+ break;\r
+ }\r
+ case 'o': outfn = strdup(optarg); break;\r
+ }\r
+ }\r
+ if (argc - optind < 2) {\r
+ fprintf(stderr, "Usage: samtools cat [-h header.sam] [-o out.bam] <in1.bam> <in2.bam> [...]\n");\r
+ return 1;\r
+ }\r
+ ret = bam_cat(argc - optind, argv + optind, h, outfn? outfn : "-");\r
+ free(outfn);\r
+ return ret;\r
+}\r
#define MPLP_FMT_SP 0x200
#define MPLP_NO_INDEL 0x400
#define MPLP_EXT_BAQ 0x800
+#define MPLP_ILLUMINA13 0x1000
typedef struct {
- int max_mq, min_mq, flag, min_baseQ, capQ_thres, max_depth;
+ int max_mq, min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth;
int openQ, extQ, tandemQ, min_support; // for indels
double min_frac; // for indels
char *reg, *fn_pos, *pl_list;
skip = (rg && bcf_str2id(ma->conf->rghash, (const char*)(rg+1)) >= 0);
if (skip) continue;
}
+ if (ma->conf->flag & MPLP_ILLUMINA13) {
+ int i;
+ uint8_t *qual = bam1_qual(b);
+ for (i = 0; i < b->core.l_qseq; ++i)
+ qual[i] = qual[i] > 31? qual[i] - 31 : 0;
+ }
has_ref = (ma->ref && ma->ref_id == b->core.tid)? 1 : 0;
skip = 0;
if (has_ref && (ma->conf->flag&MPLP_REALN)) bam_prob_realn_core(b, ma->ref, (ma->conf->flag & MPLP_EXT_BAQ)? 3 : 1);
extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list);
extern void bcf_call_del_rghash(void *rghash);
mplp_aux_t **data;
- int i, tid, pos, *n_plp, beg0 = 0, end0 = 1u<<29, ref_len, ref_tid, max_depth;
+ int i, tid, pos, *n_plp, beg0 = 0, end0 = 1u<<29, ref_len, ref_tid, max_depth, max_indel_depth;
const bam_pileup1_t **plp;
bam_mplp_t iter;
bam_header_t *h = 0;
max_depth = 8000 / sm->n;
fprintf(stderr, "<%s> Set max per-sample depth to %d\n", __func__, max_depth);
}
+ max_indel_depth = conf->max_indel_depth * sm->n;
bam_mplp_set_maxcnt(iter, max_depth);
while (bam_mplp_auto(iter, &tid, &pos, n_plp, plp) > 0) {
if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested
ref_tid = tid;
}
if (conf->flag & MPLP_GLF) {
- int _ref0, ref16;
+ int total_depth, _ref0, ref16;
bcf1_t *b = calloc(1, sizeof(bcf1_t));
+ for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i];
group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp);
_ref0 = (ref && pos < ref_len)? ref[pos] : 'N';
ref16 = bam_nt16_table[_ref0];
bcf_write(bp, bh, b);
bcf_destroy(b);
// call indels
- if (!(conf->flag&MPLP_NO_INDEL) && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) {
+ if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) {
for (i = 0; i < gplp.n; ++i)
bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i);
if (bcf_call_combine(gplp.n, bcr, -1, &bc) >= 0) {
mplp.max_mq = 60;
mplp.min_baseQ = 13;
mplp.capQ_thres = 0;
- mplp.max_depth = 250;
+ mplp.max_depth = 250; mplp.max_indel_depth = 250;
mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 100;
mplp.min_frac = 0.002; mplp.min_support = 1;
mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN;
- while ((c = getopt(argc, argv, "Agf:r:l:M:q:Q:uaRC:BDSd:b:P:o:e:h:Im:F:EG:")) >= 0) {
+ while ((c = getopt(argc, argv, "Agf:r:l:M:q:Q:uaRC:BDSd:L:b:P:o:e:h:Im:F:EG:6")) >= 0) {
switch (c) {
case 'f':
mplp.fai = fai_load(optarg);
case 'S': mplp.flag |= MPLP_FMT_SP; break;
case 'I': mplp.flag |= MPLP_NO_INDEL; break;
case 'E': mplp.flag |= MPLP_EXT_BAQ; break;
+ case '6': mplp.flag |= MPLP_ILLUMINA13; break;
case 'C': mplp.capQ_thres = atoi(optarg); break;
case 'M': mplp.max_mq = atoi(optarg); break;
case 'q': mplp.min_mq = atoi(optarg); break;
case 'A': use_orphan = 1; break;
case 'F': mplp.min_frac = atof(optarg); break;
case 'm': mplp.min_support = atoi(optarg); break;
+ case 'L': mplp.max_indel_depth = atoi(optarg); break;
case 'G': {
FILE *fp_rg;
char buf[1024];
fprintf(stderr, " -M INT cap mapping quality at INT [%d]\n", mplp.max_mq);
fprintf(stderr, " -Q INT min base quality [%d]\n", mplp.min_baseQ);
fprintf(stderr, " -q INT filter out alignment with MQ smaller than INT [%d]\n", mplp.min_mq);
- fprintf(stderr, " -d INT max per-sample depth [%d]\n", mplp.max_depth);
+ fprintf(stderr, " -d INT max per-BAM depth [%d]\n", mplp.max_depth);
+ fprintf(stderr, " -L INT max per-sample depth for INDEL calling [%d]\n", mplp.max_indel_depth);
fprintf(stderr, " -P STR comma separated list of platforms for indels [all]\n");
fprintf(stderr, " -o INT Phred-scaled gap open sequencing error probability [%d]\n", mplp.openQ);
fprintf(stderr, " -e INT Phred-scaled gap extension seq error probability [%d]\n", mplp.extQ);
fprintf(stderr, " -m INT minimum gapped reads for indel candidates [%d]\n", mplp.min_support);
fprintf(stderr, " -F FLOAT minimum fraction of gapped reads for candidates [%g]\n", mplp.min_frac);
fprintf(stderr, " -G FILE exclude read groups listed in FILE [null]\n");
+ fprintf(stderr, " -6 quality is in the Illumina-1.3+ encoding\n");
fprintf(stderr, " -A use anomalous read pairs in SNP/INDEL calling\n");
fprintf(stderr, " -g generate BCF output\n");
fprintf(stderr, " -u do not compress BCF output\n");
static void sort_blocks(int n, int k, bam1_p *buf, const char *prefix, const bam_header_t *h, int is_stdout)
{
- char *name;
+ char *name, mode[3];
int i;
bamFile fp;
ks_mergesort(sort, k, buf, 0);
name = (char*)calloc(strlen(prefix) + 20, 1);
- if (n >= 0) sprintf(name, "%s.%.4d.bam", prefix, n);
- else sprintf(name, "%s.bam", prefix);
- fp = is_stdout? bam_dopen(fileno(stdout), "w") : bam_open(name, "w");
+ if (n >= 0) {
+ sprintf(name, "%s.%.4d.bam", prefix, n);
+ strcpy(mode, "w1");
+ } else {
+ sprintf(name, "%s.bam", prefix);
+ strcpy(mode, "w");
+ }
+ fp = is_stdout? bam_dopen(fileno(stdout), mode) : bam_open(name, mode);
if (fp == 0) {
fprintf(stderr, "[sort_blocks] fail to create file %s.\n", name);
free(name);
int main_reheader(int argc, char *argv[]);
int main_cut_target(int argc, char *argv[]);
int main_phase(int argc, char *argv[]);
+int main_cat(int argc, char *argv[]);
int faidx_main(int argc, char *argv[]);
int glf3_view_main(int argc, char *argv[]);
fprintf(stderr, " merge merge sorted alignments\n");
fprintf(stderr, " rmdup remove PCR duplicates\n");
fprintf(stderr, " reheader replace BAM header\n");
+ fprintf(stderr, " cat concatenate BAMs\n");
fprintf(stderr, " targetcut cut fosmid regions (for fosmid pool only)\n");
fprintf(stderr, " phase phase heterozygotes\n");
fprintf(stderr, "\n");
else if (strcmp(argv[1], "calmd") == 0) return bam_fillmd(argc-1, argv+1);
else if (strcmp(argv[1], "fillmd") == 0) return bam_fillmd(argc-1, argv+1);
else if (strcmp(argv[1], "reheader") == 0) return main_reheader(argc-1, argv+1);
+ else if (strcmp(argv[1], "cat") == 0) return main_cat(argc-1, argv+1);
else if (strcmp(argv[1], "targetcut") == 0) return main_cut_target(argc-1, argv+1);
else if (strcmp(argv[1], "phase") == 0) return main_phase(argc-1, argv+1);
#if _CURSES_LIB != 0
int bcf_is_indel(const bcf1_t *b);
bcf_hdr_t *bcf_hdr_subsam(const bcf_hdr_t *h0, int n, char *const* samples, int *list);
int bcf_subsam(int n_smpl, int *list, bcf1_t *b);
+ // move GT to the first FORMAT field
+ int bcf_fix_gt(bcf1_t *b);
+ // update PL generated by old samtools
+ int bcf_fix_pl(bcf1_t *b);
// string hash table
void *bcf_build_refhash(bcf_hdr_t *h);
\end{center}
\begin{center}
-\begin{tabular}{cll}
+\begin{tabular}{clp{9cm}}
\hline
\multicolumn{1}{l}{\bf Field} & \multicolumn{1}{l}{\bf Type} & \multicolumn{1}{l}{\bf Description} \\\hline
{\tt DP} & {\tt uint16\_t[n]} & Read depth \\
{\tt GL} & {\tt float[n*G]} & Log10 likelihood of data; $G=\frac{A(A+1)}{2}$, $A=\#\{alleles\}$\\
{\tt GT} & {\tt uint8\_t[n]} & {\tt missing\char60\char60 7 | phased\char60\char60 6 | allele1\char60\char60 3 | allele2} \\
-{\tt \_GT} & {\tt uint8\_t+uint8\_t[n*P]} & {Generic GT; the first int equals the max ploidy $P$} \\
+{\tt \_GT} & {\tt uint8\_t+uint8\_t[n*P]} & {Generic GT; the first int equals the max ploidy $P$. If the highest bit is set,
+ the allele is not present (e.g. due to different ploidy between samples).} \\
{\tt GQ} & {\tt uint8\_t[n]} & {Genotype quality}\\
{\tt HQ} & {\tt uint8\_t[n*2]} & {Haplotype quality}\\
{\tt \_HQ} & {\tt uint8\_t+uint8\_t[n*P]} & {Generic HQ}\\
BCF proposal).
\item Predefined {\tt FORMAT} fields can be missing from VCF headers, but custom {\tt FORMAT} fields
are required to be explicitly defined in the headers.
-\item A {\tt FORMAT} field with its name starting with `{\tt \_}' gives an alternative
- binary representation of the corresponding VCF field. The alternative representation
- is used when the default representation is unable to keep the genotype information,
- for example, when the ploidy is over 2 or there are more than 8 alleles.
+\item A {\tt FORMAT} field with its name starting with `{\tt \_}' is specific to BCF only.
+ It gives an alternative binary representation of the corresponding VCF field, in case
+ the default representation is unable to keep the genotype information,
+ for example, when the ploidy is not 2 or there are more than 8 alleles.
\end{itemize}
\end{document}
-.TH bcftools 1 "2 October 2010" "bcftools" "Bioinformatics tools"
+.TH bcftools 1 "16 March 2011" "bcftools" "Bioinformatics tools"
.SH NAME
.PP
bcftools - Utilities for the Binary Call Format (BCF) and VCF.
.TP 10
.B view
.B bcftools view
-.RB [ \-cbuSAGgHvNQ ]
-.RB [ \-1
-.IR nGroup1 ]
+.RB [ \-AbFGNQSucgv ]
+.RB [ \-D
+.IR seqDict ]
.RB [ \-l
-.IR listFile ]
+.IR listLoci ]
+.RB [ \-s
+.IR listSample ]
+.RB [ \-i
+.IR gapSNPratio ]
.RB [ \-t
.IR mutRate ]
.RB [ \-p
.IR varThres ]
.RB [ \-P
.IR prior ]
+.RB [ \-1
+.IR nGroup1 ]
+.RB [ \-d
+.IR minFrac ]
+.RB [ \-U
+.IR nPerm ]
+.RB [ \-X
+.IR permThres ]
.I in.bcf
.RI [ region ]
Convert between BCF and VCF, call variant candidates and estimate allele
frequencies.
-.B OPTIONS:
.RS
+.TP
+.B Input/Output Options:
+.TP 10
+.B -A
+Retain all possible alternate alleles at variant sites. By default, the view
+command discards unlikely alleles.
.TP 10
.B -b
Output in the BCF format. The default is VCF.
.TP
-.B -c
-Call variants.
-.TP
-.B -v
-Output variant sites only (force -c)
-.TP
-.B -g
-Call per-sample genotypes at variant sites (force -c)
+.BI -D \ FILE
+Sequence dictionary (list of chromosome names) for VCF->BCF conversion [null]
.TP
-.B -u
-Uncompressed BCF output (force -b).
-.TP
-.B -S
-The input is VCF instead of BCF.
-.TP
-.B -A
-Retain all possible alternate alleles at variant sites. By default, this
-command discards unlikely alleles.
+.B -F
+Indicate PL is generated by r921 or before (ordering is different).
.TP
.B -G
Suppress all individual genotype information.
.TP
-.B -H
-Perform Hardy-Weiberg Equilibrium test. This will add computation time, sometimes considerably.
+.BI -l \ FILE
+List of sites at which information are outputted [all sites]
.TP
.B -N
Skip sites where the REF field is not A/C/G/T
.B -Q
Output the QCALL likelihood format
.TP
-.B -f
-Reference-free variant calling mode. In this mode, the prior will be
-folded; a variant is called iff the sample(s) contains at least two
-alleles; the QUAL field in the VCF/BCF output is changed accordingly.
+.BI -s \ FILE
+List of samples to use. The first column in the input gives the sample names
+and the second gives the ploidy, which can only be 1 or 2. When the 2nd column
+is absent, the sample ploidy is assumed to be 2. In the output, the ordering of
+samples will be identical to the one in
+.IR FILE .
+[null]
.TP
-.BI "-1 " INT
-Number of group-1 samples. This option is used for dividing input into
-two groups for comparing. A zero value disables this functionality. [0]
+.B -S
+The input is VCF instead of BCF.
.TP
-.BI "-l " FILE
-List of sites at which information are outputted [all sites]
+.B -u
+Uncompressed BCF output (force -b).
.TP
-.BI "-t " FLOAT
-Scaled muttion rate for variant calling [0.001]
+.B Consensus/Variant Calling Options:
+.TP 10
+.B -c
+Call variants.
+.TP
+.BI -d \ FLOAT
+When
+.B -v
+is in use, skip loci where the fraction of samples covered by reads is below FLOAT. [0]
+.TP
+.B -g
+Call per-sample genotypes at variant sites (force -c)
.TP
-.BI "-p " FLOAT
+.BI -i \ FLOAT
+Ratio of INDEL-to-SNP mutation rate [0.15]
+.TP
+.BI -p \ FLOAT
A site is considered to be a variant if P(ref|D)<FLOAT [0.5]
.TP
-.BI "-P " STR
+.BI -P \ STR
Prior or initial allele frequency spectrum. If STR can be
.IR full ,
.IR cond2 ,
.I flat
or the file consisting of error output from a previous variant calling
run.
+.TP
+.BI -t \ FLOAT
+Scaled muttion rate for variant calling [0.001]
+.TP
+.B -v
+Output variant sites only (force -c)
+.TP
+.B Contrast Calling and Association Test Options:
+.TP
+.BI -1 \ INT
+Number of group-1 samples. This option is used for dividing the samples into
+two groups for contrast SNP calling or association test.
+When this option is in use, the following VCF INFO will be outputted:
+PC2, PCHI2 and QCHI2. [0]
+.TP
+.BI -U \ INT
+Number of permutations for association test (effective only with
+.BR -1 )
+[0]
+.TP
+.BI -X \ FLOAT
+Only perform permutations for P(chi^2)<FLOAT (effective only with
+.BR -U )
+[0.01]
.RE
.TP
Concatenate BCF files. The input files are required to be sorted and
have identical samples appearing in the same order.
.RE
+
+.SH BCFTOOLS SPECIFIC VCF TAGS
+
+.TS
+center box;
+cb | cb | cb
+l | l | l .
+Tag Format Description
+_
+AF1 double Max-likelihood estimate of the site allele frequency (AF) of the first ALT allele
+CI95 double[2] Equal-tail Bayesian credible interval of AF at the 95% level
+DP int Raw read depth (without quality filtering)
+DP4 int[4] # high-quality reference forward bases, ref reverse, alternate for and alt rev bases
+FQ int Consensus quality. Positive: sample genotypes different; negative: otherwise
+MQ int Root-Mean-Square mapping quality of covering reads
+PC2 int[2] Phred probability of AF in group1 samples being larger (,smaller) than in group2
+PCHI2 double Posterior weighted chi^2 P-value between group1 and group2 samples
+PV4 double[4] P-value for strand bias, baseQ bias, mapQ bias and tail distance bias
+QCHI2 int Phred-scaled PCHI2
+RP int # permutations yielding a smaller PCHI2
+.TE
return 0;
}
+int bcf_fix_pl(bcf1_t *b)
+{
+ int i;
+ uint32_t tmp;
+ uint8_t *PL, *swap;
+ bcf_ginfo_t *gi;
+ // pinpoint PL
+ tmp = bcf_str2int("PL", 2);
+ for (i = 0; i < b->n_gi; ++i)
+ if (b->gi[i].fmt == tmp) break;
+ if (i == b->n_gi) return 0;
+ // prepare
+ gi = b->gi + i;
+ PL = (uint8_t*)gi->data;
+ swap = alloca(gi->len);
+ // loop through individuals
+ for (i = 0; i < b->n_smpl; ++i) {
+ int k, l, x;
+ uint8_t *PLi = PL + i * gi->len;
+ memcpy(swap, PLi, gi->len);
+ for (k = x = 0; k < b->n_alleles; ++k)
+ for (l = k; l < b->n_alleles; ++l)
+ PLi[l*(l+1)/2 + k] = swap[x++];
+ }
+ return 0;
+}
+
+int bcf_smpl_covered(const bcf1_t *b)
+{
+ int i, j, n = 0;
+ uint32_t tmp;
+ bcf_ginfo_t *gi;
+ // pinpoint PL
+ tmp = bcf_str2int("PL", 2);
+ for (i = 0; i < b->n_gi; ++i)
+ if (b->gi[i].fmt == tmp) break;
+ if (i == b->n_gi) return 0;
+ // count how many samples having PL!=[0..0]
+ gi = b->gi + i;
+ for (i = 0; i < b->n_smpl; ++i) {
+ uint8_t *PLi = ((uint8_t*)gi->data) + i * gi->len;
+ for (j = 0; j < gi->len; ++j)
+ if (PLi[j]) break;
+ if (j < gi->len) ++n;
+ }
+ return n;
+}
+
static void *locate_field(const bcf1_t *b, const char *fmt, int l)
{
int i;
return 0;
}
+// FIXME: only data are shuffled; the header is NOT
+int bcf_shuffle(bcf1_t *b, int seed)
+{
+ int i, j, *a;
+ if (seed > 0) srand48(seed);
+ a = malloc(b->n_smpl * sizeof(int));
+ for (i = 0; i < b->n_smpl; ++i) a[i] = i;
+ for (i = b->n_smpl; i > 1; --i) {
+ int tmp;
+ j = (int)(drand48() * i);
+ tmp = a[j]; a[j] = a[i-1]; a[i-1] = tmp;
+ }
+ for (j = 0; j < b->n_gi; ++j) {
+ bcf_ginfo_t *gi = b->gi + j;
+ uint8_t *swap, *data = (uint8_t*)gi->data;
+ swap = malloc(gi->len * b->n_smpl);
+ for (i = 0; i < b->n_smpl; ++i)
+ memcpy(swap + gi->len * a[i], data + gi->len * i, gi->len);
+ free(gi->data);
+ gi->data = swap;
+ }
+ free(a);
+ return 0;
+}
+
bcf_hdr_t *bcf_hdr_subsam(const bcf_hdr_t *h0, int n, char *const* samples, int *list)
{
int i, ret, j;
kstring_t s;
s.l = s.m = 0; s.s = 0;
hash = kh_init(str2id);
- for (i = 0; i < n; ++i)
- k = kh_put(str2id, hash, samples[i], &ret);
- for (i = j = 0; i < h0->n_smpl; ++i) {
- if (kh_get(str2id, hash, h0->sns[i]) != kh_end(hash)) {
- list[j++] = i;
- kputs(h0->sns[i], &s); kputc('\0', &s);
+ for (i = 0; i < h0->n_smpl; ++i) {
+ k = kh_put(str2id, hash, h0->sns[i], &ret);
+ kh_val(hash, k) = i;
+ }
+ for (i = j = 0; i < n; ++i) {
+ k = kh_get(str2id, hash, samples[i]);
+ if (k != kh_end(hash)) {
+ list[j++] = kh_val(hash, k);
+ kputs(samples[i], &s); kputc('\0', &s);
}
}
if (j < n) fprintf(stderr, "<%s> %d samples in the list but not in BCF.", __func__, n - j);
return h;
}
-int bcf_subsam(int n_smpl, int *list, bcf1_t *b) // list MUST BE sorted
+int bcf_subsam(int n_smpl, int *list, bcf1_t *b)
{
int i, j;
for (j = 0; j < b->n_gi; ++j) {
bcf_ginfo_t *gi = b->gi + j;
+ uint8_t *swap;
+ swap = malloc(gi->len * b->n_smpl);
for (i = 0; i < n_smpl; ++i)
- memcpy((uint8_t*)gi->data + i * gi->len, (uint8_t*)gi->data + list[i] * gi->len, gi->len);
+ memcpy(swap + i * gi->len, (uint8_t*)gi->data + list[i] * gi->len, gi->len);
+ free(gi->data);
+ gi->data = swap;
}
b->n_smpl = n_smpl;
return 0;
#include "bcf.h"
#include "prob1.h"
#include "kstring.h"
+#include "time.h"
#include "khash.h"
KHASH_SET_INIT_INT64(set64)
#define VC_VARONLY 16
#define VC_VCFIN 32
#define VC_UNCOMP 64
-#define VC_HWE 128
#define VC_KEEPALT 256
#define VC_ACGT_ONLY 512
#define VC_QCALL 1024
#define VC_ADJLD 4096
#define VC_NO_INDEL 8192
#define VC_ANNO_MAX 16384
+#define VC_FIX_PL 32768
typedef struct {
- int flag, prior_type, n1, n_sub, *sublist;
+ int flag, prior_type, n1, n_sub, *sublist, n_perm;
char *fn_list, *prior_file, **subsam, *fn_dict;
- double theta, pref, indel_frac;
+ uint8_t *ploidy;
+ double theta, pref, indel_frac, min_perm_p, min_smpl_frac;
} viewconf_t;
khash_t(set64) *bcf_load_pos(const char *fn, bcf_hdr_t *_h)
return hash;
}
-static double test_hwe(const double g[3])
-{
- extern double kf_gammaq(double p, double x);
- double fexp, chi2, f[3], n;
- int i;
- n = g[0] + g[1] + g[2];
- fexp = (2. * g[2] + g[1]) / (2. * n);
- if (fexp > 1. - 1e-10) fexp = 1. - 1e-10;
- if (fexp < 1e-10) fexp = 1e-10;
- f[0] = n * (1. - fexp) * (1. - fexp);
- f[1] = n * 2. * fexp * (1. - fexp);
- f[2] = n * fexp * fexp;
- for (i = 0, chi2 = 0.; i < 3; ++i)
- chi2 += (g[i] - f[i]) * (g[i] - f[i]) / f[i];
- return kf_gammaq(.5, chi2 / 2.);
-}
-
typedef struct {
double p[4];
int mq, depth, is_tested, d[4];
static int update_bcf1(int n_smpl, bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p1rst_t *pr, double pref, int flag)
{
kstring_t s;
- int is_var = (pr->p_ref < pref);
- double p_hwe, r = is_var? pr->p_ref : pr->p_var, fq;
+ int has_I16, is_var = (pr->p_ref < pref);
+ double fq, r = is_var? pr->p_ref : pr->p_var;
anno16_t a;
- p_hwe = pr->g[0] >= 0.? test_hwe(pr->g) : 1.0; // only do HWE g[] is calculated
- test16(b, &a);
+ has_I16 = test16(b, &a) >= 0? 1 : 0;
rm_info(b, "I16=");
memset(&s, 0, sizeof(kstring_t));
if (b->info[0]) kputc(';', &s);
// ksprintf(&s, "AF1=%.4lg;AFE=%.4lg;CI95=%.4lg,%.4lg", 1.-pr->f_em, 1.-pr->f_exp, pr->cil, pr->cih);
ksprintf(&s, "AF1=%.4g;CI95=%.4g,%.4g", 1.-pr->f_em, pr->cil, pr->cih);
- ksprintf(&s, ";DP4=%d,%d,%d,%d;MQ=%d", a.d[0], a.d[1], a.d[2], a.d[3], a.mq);
+ if (has_I16) ksprintf(&s, ";DP4=%d,%d,%d,%d;MQ=%d", a.d[0], a.d[1], a.d[2], a.d[3], a.mq);
fq = pr->p_ref_folded < 0.5? -4.343 * log(pr->p_ref_folded) : 4.343 * log(pr->p_var_folded);
if (fq < -999) fq = -999;
if (fq > 999) fq = 999;
ksprintf(&s, ";FQ=%.3g", fq);
- if (a.is_tested) {
- if (pr->pc[0] >= 0.) ksprintf(&s, ";PC4=%g,%g,%g,%g", pr->pc[0], pr->pc[1], pr->pc[2], pr->pc[3]);
- ksprintf(&s, ";PV4=%.2g,%.2g,%.2g,%.2g", a.p[0], a.p[1], a.p[2], a.p[3]);
+ if (pr->cmp[0] >= 0.) {
+ int i, q[3], pq;
+ for (i = 1; i < 3; ++i) {
+ double x = pr->cmp[i] + pr->cmp[0]/2.;
+ q[i] = x == 0? 255 : (int)(-4.343 * log(x) + .499);
+ if (q[i] > 255) q[i] = 255;
+ }
+ pq = (int)(-4.343 * log(pr->p_chi2) + .499);
+ if (pr->perm_rank >= 0) ksprintf(&s, ";PR=%d", pr->perm_rank);
+ ksprintf(&s, ";QCHI2=%d;PCHI2=%.3g;PC2=%d,%d", pq, q[1], q[2], pr->p_chi2);
+// ksprintf(&s, ",%g,%g,%g", pr->cmp[0], pr->cmp[1], pr->cmp[2]);
}
- if (pr->g[0] >= 0. && p_hwe <= .2)
- ksprintf(&s, ";GC=%.2f,%.2f,%.2f;HWE=%.3f", pr->g[2], pr->g[1], pr->g[0], p_hwe);
+ if (has_I16 && a.is_tested) ksprintf(&s, ";PV4=%.2g,%.2g,%.2g,%.2g", a.p[0], a.p[1], a.p[2], a.p[3]);
kputc('\0', &s);
kputs(b->fmt, &s); kputc('\0', &s);
free(b->str);
if (fp == 0) return 0; // fail to open file
ks = ks_init(fp);
while (ks_getuntil(ks, 0, &s, &dret) >= 0) {
+ int l;
if (max == n) {
max = max? max<<1 : 4;
sam = realloc(sam, sizeof(void*)*max);
}
- sam[n++] = strdup(s.s);
+ l = s.l;
+ sam[n] = malloc(s.l + 2);
+ strcpy(sam[n], s.s);
+ sam[n][l+1] = 2; // by default, diploid
+ if (dret != '\n') {
+ if (ks_getuntil(ks, 0, &s, &dret) >= 0) { // read ploidy, 1 or 2
+ int x = (int)s.s[0] - '0';
+ if (x == 1 || x == 2) sam[n][l+1] = x;
+ else fprintf(stderr, "(%s) ploidy can only be 1 or 2; assume diploid\n", __func__);
+ }
+ if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret);
+ }
+ ++n;
}
ks_destroy(ks);
gzclose(fp);
if (!strstr(str.s, "##INFO=<ID=MQ,"))
kputs("##INFO=<ID=MQ,Number=1,Type=Integer,Description=\"Root-mean-square mapping quality of covering reads\">\n", &str);
if (!strstr(str.s, "##INFO=<ID=FQ,"))
- kputs("##INFO=<ID=FQ,Number=1,Type=Float,Description=\"Phred probability that sample chromosomes are not all the same\">\n", &str);
+ kputs("##INFO=<ID=FQ,Number=1,Type=Float,Description=\"Phred probability of all samples being the same\">\n", &str);
if (!strstr(str.s, "##INFO=<ID=AF1,"))
kputs("##INFO=<ID=AF1,Number=1,Type=Float,Description=\"Max-likelihood estimate of the site allele frequency of the first ALT allele\">\n", &str);
if (!strstr(str.s, "##INFO=<ID=CI95,"))
kputs("##INFO=<ID=PV4,Number=4,Type=Float,Description=\"P-values for strand bias, baseQ bias, mapQ bias and tail distance bias\">\n", &str);
if (!strstr(str.s, "##INFO=<ID=INDEL,"))
kputs("##INFO=<ID=INDEL,Number=0,Type=Flag,Description=\"Indicates that the variant is an INDEL.\">\n", &str);
- if (!strstr(str.s, "##INFO=<ID=GT,"))
+ if (!strstr(str.s, "##INFO=<ID=PC2,"))
+ kputs("##INFO=<ID=PC2,Number=2,Type=Integer,Description=\"Phred probability of the nonRef allele frequency in group1 samples being larger (,smaller) than in group2.\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=PCHI2,"))
+ kputs("##INFO=<ID=PCHI2,Number=1,Type=Float,Description=\"Posterior weighted chi^2 P-value for testing the association between group1 and group2 samples.\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=QCHI2,"))
+ kputs("##INFO=<ID=QCHI2,Number=1,Type=Integer,Description=\"Phred scaled PCHI2.\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=RP,"))
+ kputs("##INFO=<ID=RP,Number=1,Type=Integer,Description=\"# permutations yielding a smaller PCHI2.\">\n", &str);
+ if (!strstr(str.s, "##FORMAT=<ID=GT,"))
kputs("##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n", &str);
if (!strstr(str.s, "##FORMAT=<ID=GQ,"))
kputs("##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">\n", &str);
extern void bcf_p1_indel_prior(bcf_p1aux_t *ma, double x);
extern int bcf_fix_gt(bcf1_t *b);
extern int bcf_anno_max(bcf1_t *b);
+ extern int bcf_shuffle(bcf1_t *b, int seed);
bcf_t *bp, *bout = 0;
bcf1_t *b, *blast;
- int c;
+ int c, *seeds = 0;
uint64_t n_processed = 0;
viewconf_t vc;
bcf_p1aux_t *p1 = 0;
tid = begin = end = -1;
memset(&vc, 0, sizeof(viewconf_t));
- vc.prior_type = vc.n1 = -1; vc.theta = 1e-3; vc.pref = 0.5; vc.indel_frac = -1.;
- vc.n_sub = 0; vc.subsam = 0; vc.sublist = 0;
- while ((c = getopt(argc, argv, "N1:l:cHAGvbSuP:t:p:QgLi:IMs:D:")) >= 0) {
+ vc.prior_type = vc.n1 = -1; vc.theta = 1e-3; vc.pref = 0.5; vc.indel_frac = -1.; vc.n_perm = 0; vc.min_perm_p = 0.01; vc.min_smpl_frac = 0;
+ while ((c = getopt(argc, argv, "FN1:l:cHAGvbSuP:t:p:QgLi:IMs:D:U:X:d:")) >= 0) {
switch (c) {
case '1': vc.n1 = atoi(optarg); break;
case 'l': vc.fn_list = strdup(optarg); break;
case 'D': vc.fn_dict = strdup(optarg); break;
+ case 'F': vc.flag |= VC_FIX_PL; break;
case 'N': vc.flag |= VC_ACGT_ONLY; break;
case 'G': vc.flag |= VC_NO_GENO; break;
case 'A': vc.flag |= VC_KEEPALT; break;
case 'c': vc.flag |= VC_CALL; break;
case 'v': vc.flag |= VC_VARONLY | VC_CALL; break;
case 'u': vc.flag |= VC_UNCOMP | VC_BCFOUT; break;
- case 'H': vc.flag |= VC_HWE; break;
case 'g': vc.flag |= VC_CALL_GT | VC_CALL; break;
case 'I': vc.flag |= VC_NO_INDEL; break;
case 'M': vc.flag |= VC_ANNO_MAX; break;
case 'i': vc.indel_frac = atof(optarg); break;
case 'Q': vc.flag |= VC_QCALL; break;
case 'L': vc.flag |= VC_ADJLD; break;
- case 's': vc.subsam = read_samples(optarg, &vc.n_sub); break;
+ case 'U': vc.n_perm = atoi(optarg); break;
+ case 'X': vc.min_perm_p = atof(optarg); break;
+ case 'd': vc.min_smpl_frac = atof(optarg); break;
+ case 's': vc.subsam = read_samples(optarg, &vc.n_sub);
+ vc.ploidy = calloc(vc.n_sub + 1, 1);
+ for (tid = 0; tid < vc.n_sub; ++tid) vc.ploidy[tid] = vc.subsam[tid][strlen(vc.subsam[tid]) + 1];
+ tid = -1;
+ break;
case 'P':
if (strcmp(optarg, "full") == 0) vc.prior_type = MC_PTYPE_FULL;
else if (strcmp(optarg, "cond2") == 0) vc.prior_type = MC_PTYPE_COND2;
fprintf(stderr, " -S input is VCF\n");
fprintf(stderr, " -A keep all possible alternate alleles at variant sites\n");
fprintf(stderr, " -G suppress all individual genotype information\n");
- fprintf(stderr, " -H perform Hardy-Weinberg test (slower)\n");
fprintf(stderr, " -N skip sites where REF is not A/C/G/T\n");
fprintf(stderr, " -Q output the QCALL likelihood format\n");
fprintf(stderr, " -L calculate LD for adjacent sites\n");
fprintf(stderr, " -I skip indels\n");
+ fprintf(stderr, " -F PL generated by r921 or before\n");
+ fprintf(stderr, " -d FLOAT skip loci where less than FLOAT fraction of samples covered [0]\n");
fprintf(stderr, " -D FILE sequence dictionary for VCF->BCF conversion [null]\n");
- fprintf(stderr, " -1 INT number of group-1 samples [0]\n");
fprintf(stderr, " -l FILE list of sites to output [all sites]\n");
fprintf(stderr, " -t FLOAT scaled substitution mutation rate [%.4g]\n", vc.theta);
fprintf(stderr, " -i FLOAT indel-to-substitution ratio [%.4g]\n", vc.indel_frac);
fprintf(stderr, " -p FLOAT variant if P(ref|D)<FLOAT [%.3g]\n", vc.pref);
fprintf(stderr, " -P STR type of prior: full, cond2, flat [full]\n");
fprintf(stderr, " -s FILE list of samples to use [all samples]\n");
+ fprintf(stderr, " -1 INT number of group-1 samples [0]\n");
+ fprintf(stderr, " -U INT number of permutations for association testing (effective with -1) [0]\n");
+ fprintf(stderr, " -X FLOAT only perform permutations for P(chi^2)<FLOAT [%g]\n", vc.min_perm_p);
fprintf(stderr, "\n");
return 1;
}
fprintf(stderr, "[%s] For VCF->BCF conversion please specify the sequence dictionary with -D\n", __func__);
return 1;
}
+ if (vc.n1 <= 0) vc.n_perm = 0; // TODO: give a warning here!
+ if (vc.n_perm > 0) {
+ seeds = malloc(vc.n_perm * sizeof(int));
+ srand48(time(0));
+ for (c = 0; c < vc.n_perm; ++c) seeds[c] = lrand48();
+ }
b = calloc(1, sizeof(bcf1_t));
blast = calloc(1, sizeof(bcf1_t));
strcpy(moder, "r");
vcf_hdr_write(bout, hout);
}
if (vc.flag & VC_CALL) {
- p1 = bcf_p1_init(hout->n_smpl);
+ p1 = bcf_p1_init(hout->n_smpl, vc.ploidy);
if (vc.prior_file) {
if (bcf_p1_read_prior(p1, vc.prior_file) < 0) {
fprintf(stderr, "[%s] fail to read the prior AFS.\n", __func__);
}
while (vcf_read(bp, hin, b) > 0) {
int is_indel;
+ if ((vc.flag & VC_VARONLY) && strcmp(b->alt, "X") == 0) continue;
+ if ((vc.flag & VC_VARONLY) && vc.min_smpl_frac > 0.) {
+ extern int bcf_smpl_covered(const bcf1_t *b);
+ int n = bcf_smpl_covered(b);
+ if ((double)n / b->n_smpl < vc.min_smpl_frac) continue;
+ }
if (vc.n_sub) bcf_subsam(vc.n_sub, vc.sublist, b);
+ if (vc.flag & VC_FIX_PL) bcf_fix_pl(b);
is_indel = bcf_is_indel(b);
if ((vc.flag & VC_NO_INDEL) && is_indel) continue;
if ((vc.flag & VC_ACGT_ONLY) && !is_indel) {
if (vc.flag & VC_CALL) { // call variants
bcf_p1rst_t pr;
bcf_p1_cal(b, p1, &pr); // pr.g[3] is not calculated here
- if (vc.flag&VC_HWE) bcf_p1_cal_g3(p1, pr.g);
if (n_processed % 100000 == 0) {
fprintf(stderr, "[%s] %ld sites processed.\n", __func__, (long)n_processed);
bcf_p1_dump_afs(p1);
}
if (pr.p_ref >= vc.pref && (vc.flag & VC_VARONLY)) continue;
+ if (vc.n_perm && vc.n1 > 0 && pr.p_chi2 < vc.min_perm_p) { // permutation test
+ bcf_p1rst_t r;
+ int i, n = 0;
+ for (i = 0; i < vc.n_perm; ++i) {
+ bcf_shuffle(b, seeds[i]);
+ bcf_p1_cal(b, p1, &r);
+ if (pr.p_chi2 >= r.p_chi2) ++n;
+ }
+ pr.perm_rank = n;
+ }
update_bcf1(hout->n_smpl, b, p1, &pr, vc.pref, vc.flag);
}
if (vc.flag & VC_ADJLD) { // compute LD
if (hash) kh_destroy(set64, hash);
if (vc.fn_list) free(vc.fn_list);
if (vc.fn_dict) free(vc.fn_dict);
+ if (vc.ploidy) free(vc.ploidy);
if (vc.n_sub) {
int i;
for (i = 0; i < vc.n_sub; ++i) free(vc.subsam[i]);
free(vc.subsam); free(vc.sublist);
}
+ if (seeds) free(seeds);
if (p1) bcf_p1_destroy(p1);
return 0;
}
#include <string.h>
#include <stdio.h>
#include <errno.h>
+#include <assert.h>
#include "prob1.h"
#include "kseq.h"
struct __bcf_p1aux_t {
int n, M, n1, is_indel;
+ uint8_t *ploidy; // haploid or diploid ONLY
double *q2p, *pdg; // pdg -> P(D|g)
double *phi, *phi_indel;
double *z, *zswap; // aux for afs
double *z1, *z2, *phi1, *phi2; // only calculated when n1 is set
+ double **hg; // hypergeometric distribution
double t, t1, t2;
double *afs, *afs1; // afs: accumulative AFS; afs1: site posterior distribution
const uint8_t *PL; // point to PL
return 0;
}
-bcf_p1aux_t *bcf_p1_init(int n)
+bcf_p1aux_t *bcf_p1_init(int n, uint8_t *ploidy)
{
bcf_p1aux_t *ma;
int i;
ma = calloc(1, sizeof(bcf_p1aux_t));
ma->n1 = -1;
ma->n = n; ma->M = 2 * n;
+ if (ploidy) {
+ ma->ploidy = malloc(n);
+ memcpy(ma->ploidy, ploidy, n);
+ for (i = 0, ma->M = 0; i < n; ++i) ma->M += ploidy[i];
+ if (ma->M == 2 * n) {
+ free(ma->ploidy);
+ ma->ploidy = 0;
+ }
+ }
ma->q2p = calloc(256, sizeof(double));
ma->pdg = calloc(3 * ma->n, sizeof(double));
ma->phi = calloc(ma->M + 1, sizeof(double));
ma->phi_indel = calloc(ma->M + 1, sizeof(double));
ma->phi1 = calloc(ma->M + 1, sizeof(double));
ma->phi2 = calloc(ma->M + 1, sizeof(double));
- ma->z = calloc(2 * ma->n + 1, sizeof(double));
- ma->zswap = calloc(2 * ma->n + 1, sizeof(double));
+ ma->z = calloc(ma->M + 1, sizeof(double));
+ ma->zswap = calloc(ma->M + 1, sizeof(double));
ma->z1 = calloc(ma->M + 1, sizeof(double)); // actually we do not need this large
ma->z2 = calloc(ma->M + 1, sizeof(double));
- ma->afs = calloc(2 * ma->n + 1, sizeof(double));
- ma->afs1 = calloc(2 * ma->n + 1, sizeof(double));
+ ma->afs = calloc(ma->M + 1, sizeof(double));
+ ma->afs1 = calloc(ma->M + 1, sizeof(double));
for (i = 0; i < 256; ++i)
ma->q2p[i] = pow(10., -i / 10.);
bcf_p1_init_prior(ma, MC_PTYPE_FULL, 1e-3); // the simplest prior
int bcf_p1_set_n1(bcf_p1aux_t *b, int n1)
{
if (n1 == 0 || n1 >= b->n) return -1;
+ if (b->M != b->n * 2) {
+ fprintf(stderr, "[%s] unable to set `n1' when there are haploid samples.\n", __func__);
+ return -1;
+ }
b->n1 = n1;
return 0;
}
void bcf_p1_destroy(bcf_p1aux_t *ma)
{
if (ma) {
- free(ma->q2p); free(ma->pdg);
+ int k;
+ if (ma->hg && ma->n1 > 0) {
+ for (k = 0; k <= 2*ma->n1; ++k) free(ma->hg[k]);
+ free(ma->hg);
+ }
+ free(ma->ploidy); free(ma->q2p); free(ma->pdg);
free(ma->phi); free(ma->phi_indel); free(ma->phi1); free(ma->phi2);
free(ma->z); free(ma->zswap); free(ma->z1); free(ma->z2);
free(ma->afs); free(ma->afs1);
{
double sum, g[3];
double max, f3[3], *pdg = ma->pdg + k * 3;
- int q, i, max_i;
- f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0;
+ int q, i, max_i, ploidy;
+ ploidy = ma->ploidy? ma->ploidy[k] : 2;
+ if (ploidy == 2) {
+ f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0;
+ } else {
+ f3[0] = 1. - f0; f3[1] = 0; f3[2] = f0;
+ }
for (i = 0, sum = 0.; i < 3; ++i)
sum += (g[i] = pdg[i] * f3[i]);
for (i = 0, max = -1., max_i = 0; i < 3; ++i) {
{
double *z[2], *tmp, *pdg;
int _j, last_min, last_max;
+ assert(beg == 0 || ma->M == ma->n*2);
z[0] = ma->z;
z[1] = ma->zswap;
pdg = ma->pdg;
z[0][0] = 1.;
last_min = last_max = 0;
ma->t = 0.;
- for (_j = beg; _j < ma->n; ++_j) {
- int k, j = _j - beg, _min = last_min, _max = last_max;
- double p[3], sum;
- pdg = ma->pdg + _j * 3;
- p[0] = pdg[0]; p[1] = 2. * pdg[1]; p[2] = pdg[2];
- for (; _min < _max && z[0][_min] < TINY; ++_min) z[0][_min] = z[1][_min] = 0.;
- for (; _max > _min && z[0][_max] < TINY; --_max) z[0][_max] = z[1][_max] = 0.;
- _max += 2;
- if (_min == 0)
- k = 0, z[1][k] = (2*j+2-k)*(2*j-k+1) * p[0] * z[0][k];
- if (_min <= 1)
- k = 1, z[1][k] = (2*j+2-k)*(2*j-k+1) * p[0] * z[0][k] + k*(2*j+2-k) * p[1] * z[0][k-1];
- for (k = _min < 2? 2 : _min; k <= _max; ++k)
- z[1][k] = (2*j+2-k)*(2*j-k+1) * p[0] * z[0][k]
- + k*(2*j+2-k) * p[1] * z[0][k-1]
- + k*(k-1)* p[2] * z[0][k-2];
- for (k = _min, sum = 0.; k <= _max; ++k) sum += z[1][k];
- ma->t += log(sum / ((2. * j + 2) * (2. * j + 1)));
- for (k = _min; k <= _max; ++k) z[1][k] /= sum;
- if (_min >= 1) z[1][_min-1] = 0.;
- if (_min >= 2) z[1][_min-2] = 0.;
- if (j < ma->n - 1) z[1][_max+1] = z[1][_max+2] = 0.;
- if (_j == ma->n1 - 1) { // set pop1
- ma->t1 = ma->t;
- memcpy(ma->z1, z[1], sizeof(double) * (ma->n1 * 2 + 1));
+ if (ma->M == ma->n * 2) {
+ int M = 0;
+ for (_j = beg; _j < ma->n; ++_j) {
+ int k, j = _j - beg, _min = last_min, _max = last_max, M0;
+ double p[3], sum;
+ M0 = M; M += 2;
+ pdg = ma->pdg + _j * 3;
+ p[0] = pdg[0]; p[1] = 2. * pdg[1]; p[2] = pdg[2];
+ for (; _min < _max && z[0][_min] < TINY; ++_min) z[0][_min] = z[1][_min] = 0.;
+ for (; _max > _min && z[0][_max] < TINY; --_max) z[0][_max] = z[1][_max] = 0.;
+ _max += 2;
+ if (_min == 0) k = 0, z[1][k] = (M0-k+1) * (M0-k+2) * p[0] * z[0][k];
+ if (_min <= 1) k = 1, z[1][k] = (M0-k+1) * (M0-k+2) * p[0] * z[0][k] + k*(M0-k+2) * p[1] * z[0][k-1];
+ for (k = _min < 2? 2 : _min; k <= _max; ++k)
+ z[1][k] = (M0-k+1)*(M0-k+2) * p[0] * z[0][k] + k*(M0-k+2) * p[1] * z[0][k-1] + k*(k-1)* p[2] * z[0][k-2];
+ for (k = _min, sum = 0.; k <= _max; ++k) sum += z[1][k];
+ ma->t += log(sum / (M * (M - 1.)));
+ for (k = _min; k <= _max; ++k) z[1][k] /= sum;
+ if (_min >= 1) z[1][_min-1] = 0.;
+ if (_min >= 2) z[1][_min-2] = 0.;
+ if (j < ma->n - 1) z[1][_max+1] = z[1][_max+2] = 0.;
+ if (_j == ma->n1 - 1) { // set pop1; ma->n1==-1 when unset
+ ma->t1 = ma->t;
+ memcpy(ma->z1, z[1], sizeof(double) * (ma->n1 * 2 + 1));
+ }
+ tmp = z[0]; z[0] = z[1]; z[1] = tmp;
+ last_min = _min; last_max = _max;
+ }
+ //for (_j = 0; _j < last_min; ++_j) z[0][_j] = 0.; // TODO: are these necessary?
+ //for (_j = last_max + 1; _j < ma->M; ++_j) z[0][_j] = 0.;
+ } else { // this block is very similar to the block above; these two might be merged in future
+ int j, M = 0;
+ for (j = 0; j < ma->n; ++j) {
+ int k, M0, _min = last_min, _max = last_max;
+ double p[3], sum;
+ pdg = ma->pdg + j * 3;
+ for (; _min < _max && z[0][_min] < TINY; ++_min) z[0][_min] = z[1][_min] = 0.;
+ for (; _max > _min && z[0][_max] < TINY; --_max) z[0][_max] = z[1][_max] = 0.;
+ M0 = M;
+ M += ma->ploidy[j];
+ if (ma->ploidy[j] == 1) {
+ p[0] = pdg[0]; p[1] = pdg[2];
+ _max++;
+ if (_min == 0) k = 0, z[1][k] = (M0+1-k) * p[0] * z[0][k];
+ for (k = _min < 1? 1 : _min; k <= _max; ++k)
+ z[1][k] = (M0+1-k) * p[0] * z[0][k] + k * p[1] * z[0][k-1];
+ for (k = _min, sum = 0.; k <= _max; ++k) sum += z[1][k];
+ ma->t += log(sum / M);
+ for (k = _min; k <= _max; ++k) z[1][k] /= sum;
+ if (_min >= 1) z[1][_min-1] = 0.;
+ if (j < ma->n - 1) z[1][_max+1] = 0.;
+ } else if (ma->ploidy[j] == 2) {
+ p[0] = pdg[0]; p[1] = 2 * pdg[1]; p[2] = pdg[2];
+ _max += 2;
+ if (_min == 0) k = 0, z[1][k] = (M0-k+1) * (M0-k+2) * p[0] * z[0][k];
+ if (_min <= 1) k = 1, z[1][k] = (M0-k+1) * (M0-k+2) * p[0] * z[0][k] + k*(M0-k+2) * p[1] * z[0][k-1];
+ for (k = _min < 2? 2 : _min; k <= _max; ++k)
+ z[1][k] = (M0-k+1)*(M0-k+2) * p[0] * z[0][k] + k*(M0-k+2) * p[1] * z[0][k-1] + k*(k-1)* p[2] * z[0][k-2];
+ for (k = _min, sum = 0.; k <= _max; ++k) sum += z[1][k];
+ ma->t += log(sum / (M * (M - 1.)));
+ for (k = _min; k <= _max; ++k) z[1][k] /= sum;
+ if (_min >= 1) z[1][_min-1] = 0.;
+ if (_min >= 2) z[1][_min-2] = 0.;
+ if (j < ma->n - 1) z[1][_max+1] = z[1][_max+2] = 0.;
+ }
+ tmp = z[0]; z[0] = z[1]; z[1] = tmp;
+ last_min = _min; last_max = _max;
}
- tmp = z[0]; z[0] = z[1]; z[1] = tmp;
- last_min = _min; last_max = _max;
}
if (z[0] != ma->z) memcpy(ma->z, z[0], sizeof(double) * (ma->M + 1));
}
static void mc_cal_y(bcf_p1aux_t *ma)
{
- if (ma->n1 > 0 && ma->n1 < ma->n) {
+ if (ma->n1 > 0 && ma->n1 < ma->n && ma->M == ma->n * 2) { // NB: ma->n1 is ineffective when there are haploid samples
int k;
long double x;
memset(ma->z1, 0, sizeof(double) * (2 * ma->n1 + 1));
} else mc_cal_y_core(ma, 0);
}
-static void contrast(bcf_p1aux_t *ma, double pc[4]) // mc_cal_y() must be called before hand
+#define CONTRAST_TINY 1e-30
+
+extern double kf_gammaq(double s, double z); // incomplete gamma function for chi^2 test
+
+static inline double chi2_test(int a, int b, int c, int d)
+{
+ double x, z;
+ x = (double)(a+b) * (c+d) * (b+d) * (a+c);
+ if (x == 0.) return 1;
+ z = a * d - b * c;
+ return kf_gammaq(.5, .5 * z * z * (a+b+c+d) / x);
+}
+
+// chi2=(a+b+c+d)(ad-bc)^2/[(a+b)(c+d)(a+c)(b+d)]
+static inline double contrast2_aux(const bcf_p1aux_t *p1, double sum, int n1, int n2, int k1, int k2, double x[3])
+{
+ double p = p1->phi[k1+k2] * p1->z1[k1] * p1->z2[k2] / sum * p1->hg[k1][k2];
+ if (p < CONTRAST_TINY) return -1;
+ if (.5*k1/n1 < .5*k2/n2) x[1] += p;
+ else if (.5*k1/n1 > .5*k2/n2) x[2] += p;
+ else x[0] += p;
+ return p * chi2_test(k1, k2, (n1<<1) - k1, (n2<<1) - k2);
+}
+
+static double contrast2(bcf_p1aux_t *p1, double ret[3])
{
- int k, n1 = ma->n1, n2 = ma->n - ma->n1;
- long double sum1, sum2;
- pc[0] = pc[1] = pc[2] = pc[3] = -1.;
- if (n1 <= 0 || n2 <= 0) return;
- for (k = 0, sum1 = 0.; k <= 2*n1; ++k) sum1 += ma->phi1[k] * ma->z1[k];
- for (k = 0, sum2 = 0.; k <= 2*n2; ++k) sum2 += ma->phi2[k] * ma->z2[k];
- pc[2] = ma->phi1[2*n1] * ma->z1[2*n1] / sum1;
- pc[3] = ma->phi2[2*n2] * ma->z2[2*n2] / sum2;
- for (k = 2; k < 4; ++k) {
- pc[k] = pc[k] > .5? -(-4.343 * log(1. - pc[k] + TINY) + .499) : -4.343 * log(pc[k] + TINY) + .499;
- pc[k] = (int)pc[k];
- if (pc[k] > 99) pc[k] = 99;
- if (pc[k] < -99) pc[k] = -99;
+ int k, k1, k2, k10, k20, n1, n2;
+ double sum;
+ // get n1 and n2
+ n1 = p1->n1; n2 = p1->n - p1->n1;
+ if (n1 <= 0 || n2 <= 0) return 0.;
+ if (p1->hg == 0) { // initialize the hypergeometric distribution
+ /* NB: the hg matrix may take a lot of memory when there are many samples. There is a way
+ to avoid precomputing this matrix, but it is slower and quite intricate. The following
+ computation in this block can be accelerated with a similar strategy, but perhaps this
+ is not a serious concern for now. */
+ double tmp = lgamma(2*(n1+n2)+1) - (lgamma(2*n1+1) + lgamma(2*n2+1));
+ p1->hg = calloc(2*n1+1, sizeof(void*));
+ for (k1 = 0; k1 <= 2*n1; ++k1) {
+ p1->hg[k1] = calloc(2*n2+1, sizeof(double));
+ for (k2 = 0; k2 <= 2*n2; ++k2)
+ p1->hg[k1][k2] = exp(lgamma(k1+k2+1) + lgamma(p1->M-k1-k2+1) - (lgamma(k1+1) + lgamma(k2+1) + lgamma(2*n1-k1+1) + lgamma(2*n2-k2+1) + tmp));
+ }
+ }
+ { // compute sum1 and sum2
+ long double suml = 0;
+ for (k = 0; k <= p1->M; ++k) suml += p1->phi[k] * p1->z[k];
+ sum = suml;
+ }
+ { // get the mean k1 and k2
+ double max;
+ int max_k;
+ for (k = 0, max = 0, max_k = -1; k <= 2*n1; ++k) {
+ double x = p1->phi1[k] * p1->z1[k];
+ if (x > max) max = x, max_k = k;
+ }
+ k10 = max_k;
+ for (k = 0, max = 0, max_k = -1; k <= 2*n2; ++k) {
+ double x = p1->phi2[k] * p1->z2[k];
+ if (x > max) max = x, max_k = k;
+ }
+ k20 = max_k;
+ }
+ { // We can do the following with one nested loop, but that is an O(N^2) thing. The following code block is much faster for large N.
+ double x[3], y;
+ long double z = 0.;
+ x[0] = x[1] = x[2] = 0;
+ for (k1 = k10; k1 >= 0; --k1) {
+ for (k2 = k20; k2 >= 0; --k2) {
+ if ((y = contrast2_aux(p1, sum, n1, n2, k1, k2, x)) < 0) break;
+ else z += y;
+ }
+ for (k2 = k20 + 1; k2 <= 2*n2; ++k2) {
+ if ((y = contrast2_aux(p1, sum, n1, n2, k1, k2, x)) < 0) break;
+ else z += y;
+ }
+ }
+ ret[0] = x[0]; ret[1] = x[1]; ret[2] = x[2];
+ x[0] = x[1] = x[2] = 0;
+ for (k1 = k10 + 1; k1 <= 2*n1; ++k1) {
+ for (k2 = k20; k2 >= 0; --k2) {
+ if ((y = contrast2_aux(p1, sum, n1, n2, k1, k2, x)) < 0) break;
+ else z += y;
+ }
+ for (k2 = k20 + 1; k2 <= 2*n2; ++k2) {
+ if ((y = contrast2_aux(p1, sum, n1, n2, k1, k2, x)) < 0) break;
+ else z += y;
+ }
+ }
+ ret[0] += x[0]; ret[1] += x[1]; ret[2] += x[2];
+ if (ret[0] + ret[1] + ret[2] < 0.99) { // in case of bad things happened
+ ret[0] = ret[1] = ret[2] = 0;
+ for (k1 = 0, z = 0.; k1 <= 2*n1; ++k1)
+ for (k2 = 0; k2 <= 2*n2; ++k2)
+ if ((y = contrast2_aux(p1, sum, n1, n2, k1, k2, ret)) >= 0) z += y;
+ }
+ return (double)z;
}
- pc[0] = ma->phi2[2*n2] * ma->z2[2*n2] / sum2 * (1. - ma->phi1[2*n1] * ma->z1[2*n1] / sum1);
- pc[1] = ma->phi1[2*n1] * ma->z1[2*n1] / sum1 * (1. - ma->phi2[2*n2] * ma->z2[2*n2] / sum2);
- pc[0] = pc[0] == 1.? 99 : (int)(-4.343 * log(1. - pc[0]) + .499);
- pc[1] = pc[1] == 1.? 99 : (int)(-4.343 * log(1. - pc[1]) + .499);
}
static double mc_cal_afs(bcf_p1aux_t *ma, double *p_ref_folded, double *p_var_folded)
return sum / ma->M;
}
-long double bcf_p1_cal_g3(bcf_p1aux_t *p1a, double g[3])
-{
- long double pd = 0., g2[3];
- int i, k;
- memset(g2, 0, sizeof(long double) * 3);
- for (k = 0; k < p1a->M; ++k) {
- double f = (double)k / p1a->M, f3[3], g1[3];
- long double z = 1.;
- g1[0] = g1[1] = g1[2] = 0.;
- f3[0] = (1. - f) * (1. - f); f3[1] = 2. * f * (1. - f); f3[2] = f * f;
- for (i = 0; i < p1a->n; ++i) {
- double *pdg = p1a->pdg + i * 3;
- double x = pdg[0] * f3[0] + pdg[1] * f3[1] + pdg[2] * f3[2];
- z *= x;
- g1[0] += pdg[0] * f3[0] / x;
- g1[1] += pdg[1] * f3[1] / x;
- g1[2] += pdg[2] * f3[2] / x;
- }
- pd += p1a->phi[k] * z;
- for (i = 0; i < 3; ++i)
- g2[i] += p1a->phi[k] * z * g1[i];
- }
- for (i = 0; i < 3; ++i) g[i] = g2[i] / pd;
- return pd;
-}
-
int bcf_p1_cal(const bcf1_t *b, bcf_p1aux_t *ma, bcf_p1rst_t *rst)
{
int i, k;
long double sum = 0.;
ma->is_indel = bcf_is_indel(b);
+ rst->perm_rank = -1;
// set PL and PL_len
for (i = 0; i < b->n_gi; ++i) {
if (b->gi[i].fmt == bcf_str2int("PL", 2)) {
rst->cil = (double)(ma->M - h) / ma->M; rst->cih = (double)(ma->M - l) / ma->M;
}
rst->g[0] = rst->g[1] = rst->g[2] = -1.;
- contrast(ma, rst->pc);
+ rst->cmp[0] = rst->cmp[1] = rst->cmp[2] = rst->p_chi2 = -1.0;
+ if (rst->p_var > 0.1) // skip contrast2() if the locus is a strong non-variant
+ rst->p_chi2 = contrast2(ma, rst->cmp);
return 0;
}
typedef struct __bcf_p1aux_t bcf_p1aux_t;
typedef struct {
- int rank0;
+ int rank0, perm_rank; // NB: perm_rank is always set to -1 by bcf_p1_cal()
double f_em, f_exp, f_flat, p_ref_folded, p_ref, p_var_folded, p_var;
double cil, cih;
- double pc[4];
+ double cmp[3], p_chi2; // used by contrast2()
double g[3];
} bcf_p1rst_t;
extern "C" {
#endif
- bcf_p1aux_t *bcf_p1_init(int n);
+ bcf_p1aux_t *bcf_p1_init(int n, uint8_t *ploidy);
void bcf_p1_init_prior(bcf_p1aux_t *ma, int type, double theta);
void bcf_p1_init_subprior(bcf_p1aux_t *ma, int type, double theta);
void bcf_p1_destroy(bcf_p1aux_t *ma);
int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k);
void bcf_p1_dump_afs(bcf_p1aux_t *ma);
int bcf_p1_read_prior(bcf_p1aux_t *ma, const char *fn);
- long double bcf_p1_cal_g3(bcf_p1aux_t *p1a, double g[3]);
int bcf_p1_set_n1(bcf_p1aux_t *b, int n1);
void bcf_p1_set_folded(bcf_p1aux_t *p1a); // only effective when set_n1() is not called
static
BGZF*
-open_write(int fd, bool is_uncompressed)
+open_write(int fd, int compress_level) // compress_level==-1 for the default level
{
FILE* file = fdopen(fd, "w");
BGZF* fp;
fp = malloc(sizeof(BGZF));
fp->file_descriptor = fd;
fp->open_mode = 'w';
- fp->owned_file = 0; fp->is_uncompressed = is_uncompressed;
+ fp->owned_file = 0;
+ fp->compress_level = compress_level < 0? Z_DEFAULT_COMPRESSION : compress_level; // Z_DEFAULT_COMPRESSION==-1
+ if (fp->compress_level > 9) fp->compress_level = Z_DEFAULT_COMPRESSION;
#ifdef _USE_KNETFILE
fp->x.fpw = file;
#else
fp = open_read(fd);
#endif
} else if (strchr(mode, 'w') || strchr(mode, 'W')) {
- int fd, oflag = O_WRONLY | O_CREAT | O_TRUNC;
+ int fd, compress_level = -1, oflag = O_WRONLY | O_CREAT | O_TRUNC;
#ifdef _WIN32
oflag |= O_BINARY;
#endif
fd = open(path, oflag, 0666);
if (fd == -1) return 0;
- fp = open_write(fd, strchr(mode, 'u')? 1 : 0);
+ { // set compress_level
+ int i;
+ for (i = 0; mode[i]; ++i)
+ if (mode[i] >= '0' && mode[i] <= '9') break;
+ if (mode[i]) compress_level = (int)mode[i] - '0';
+ if (strchr(mode, 'u')) compress_level = 0;
+ }
+ fp = open_write(fd, compress_level);
}
if (fp != NULL) fp->owned_file = 1;
return fp;
if (mode[0] == 'r' || mode[0] == 'R') {
return open_read(fd);
} else if (mode[0] == 'w' || mode[0] == 'W') {
- return open_write(fd, strstr(mode, "u")? 1 : 0);
+ int i, compress_level = -1;
+ for (i = 0; mode[i]; ++i)
+ if (mode[i] >= '0' && mode[i] <= '9') break;
+ if (mode[i]) compress_level = (int)mode[i] - '0';
+ if (strchr(mode, 'u')) compress_level = 0;
+ return open_write(fd, compress_level);
} else {
return NULL;
}
int input_length = block_length;
int compressed_length = 0;
while (1) {
- int compress_level = fp->is_uncompressed? 0 : Z_DEFAULT_COMPRESSION;
z_stream zs;
zs.zalloc = NULL;
zs.zfree = NULL;
zs.next_out = (void*)&buffer[BLOCK_HEADER_LENGTH];
zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
- int status = deflateInit2(&zs, compress_level, Z_DEFLATED,
+ int status = deflateInit2(&zs, fp->compress_level, Z_DEFLATED,
GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
if (status != Z_OK) {
report_error(fp, "deflate init failed");
// Inflate the block in fp->compressed_block into fp->uncompressed_block
z_stream zs;
+ int status;
zs.zalloc = NULL;
zs.zfree = NULL;
zs.next_in = fp->compressed_block + 18;
zs.next_out = fp->uncompressed_block;
zs.avail_out = fp->uncompressed_block_size;
- int status = inflateInit2(&zs, GZIP_WINDOW_BITS);
+ status = inflateInit2(&zs, GZIP_WINDOW_BITS);
if (status != Z_OK) {
report_error(fp, "inflate init failed");
return -1;
bgzf_read_block(BGZF* fp)
{
bgzf_byte_t header[BLOCK_HEADER_LENGTH];
- int count, size = 0;
+ int count, size = 0, block_length, remaining;
#ifdef _USE_KNETFILE
int64_t block_address = knet_tell(fp->x.fpr);
if (load_block_from_cache(fp, block_address)) return 0;
report_error(fp, "invalid block header");
return -1;
}
- int block_length = unpackInt16((uint8_t*)&header[16]) + 1;
+ block_length = unpackInt16((uint8_t*)&header[16]) + 1;
bgzf_byte_t* compressed_block = (bgzf_byte_t*) fp->compressed_block;
memcpy(compressed_block, header, BLOCK_HEADER_LENGTH);
- int remaining = block_length - BLOCK_HEADER_LENGTH;
+ remaining = block_length - BLOCK_HEADER_LENGTH;
#ifdef _USE_KNETFILE
count = knet_read(fp->x.fpr, &compressed_block[BLOCK_HEADER_LENGTH], remaining);
#else
int bytes_read = 0;
bgzf_byte_t* output = data;
while (bytes_read < length) {
- int available = fp->block_length - fp->block_offset;
+ int copy_length, available = fp->block_length - fp->block_offset;
+ bgzf_byte_t *buffer;
if (available <= 0) {
if (bgzf_read_block(fp) != 0) {
return -1;
break;
}
}
- int copy_length = bgzf_min(length-bytes_read, available);
- bgzf_byte_t* buffer = fp->uncompressed_block;
+ copy_length = bgzf_min(length-bytes_read, available);
+ buffer = fp->uncompressed_block;
memcpy(output, buffer + fp->block_offset, copy_length);
fp->block_offset += copy_length;
output += copy_length;
int bgzf_write(BGZF* fp, const void* data, int length)
{
+ const bgzf_byte_t *input = data;
+ int block_length, bytes_written;
if (fp->open_mode != 'w') {
report_error(fp, "file not open for writing");
return -1;
if (fp->uncompressed_block == NULL)
fp->uncompressed_block = malloc(fp->uncompressed_block_size);
- const bgzf_byte_t* input = data;
- int block_length = fp->uncompressed_block_size;
- int bytes_written = 0;
+ input = data;
+ block_length = fp->uncompressed_block_size;
+ bytes_written = 0;
while (bytes_written < length) {
int copy_length = bgzf_min(block_length - fp->block_offset, length - bytes_written);
bgzf_byte_t* buffer = fp->uncompressed_block;
#include <stdint.h>
#include <stdio.h>
-#include <stdbool.h>
#include <zlib.h>
#ifdef _USE_KNETFILE
#include "knetfile.h"
typedef struct {
int file_descriptor;
char open_mode; // 'r' or 'w'
- bool owned_file, is_uncompressed;
+ int16_t owned_file, compress_level;
#ifdef _USE_KNETFILE
union {
knetFile *fpr;
-#!/usr/bin/env perl
-#
-#
-# Script to convert GERALD export files to SAM format.
-#
-#
-#
-########## License:
-#
-# The MIT License
-#
-# Original SAMtools version 0.1.2 copyright (c) 2008-2009 Genome Research Ltd.
-# Modifications from version 0.1.2 to 2.0.0 copyright (c) 2010 Illumina, Inc.
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-# THE SOFTWARE.
-#
-#
-#
-########## ChangeLog:
-#
-# Version: 2.0.0 (15FEB2010)
-# Script updated by Illumina in conjunction with CASAVA 1.7.0 release.
-# Major changes are as follows:
-# - The CIGAR string has been updated to include all gaps from ELANDv2 alignments.
-# - The ELAND single read alignment score is always stored in the optional "SM" field
-# and the ELAND paired read alignment score is stored in the optional "AS" field
-# when it exists.
-# - The MAPQ value is set to the higher of the two alignment scores, but no greater
-# than 254, i.e. min(254,max(SM,AS))
-# - The SAM "proper pair" bit (0x0002) is now set for read pairs meeting ELAND's
-# expected orientation and insert size criteria.
-# - The default quality score translation is set for export files which contain
-# Phread+64 quality values. An option, "--qlogodds", has been added to
-# translate quality values from the Solexa+64 format used in export files prior
-# to Pipeline 1.3
-# - The export match descriptor is now reverse-complemented when necessary such that
-# it always corresponds to the forward strand of the reference, to be consistent
-# with other information in the SAM record. It is now written to the optional
-# 'XD' field (rather than 'MD') to acknowledge its minor differences from the
-# samtools match descriptor (see additional detail below).
-# - An option, "--nofilter", has been added to include reads which have failed
-# primary analysis quality filtration. Such reads will have the corresponding
-# SAM flag bit (0x0200) set.
-# - Labels in the export 'contig' field are preserved by setting RNAME to
-# "$export_chromosome/$export_contig" when then contig label exists.
-#
-#
-# Contact: lh3
-# Version: 0.1.2 (03JAN2009)
-#
-#
-#
-########## Known Conversion Limitations:
-#
-# - Export records for reads that map to a position < 1 (allowed in export format), are converted
-# to unmapped reads in the SAM record.
-# - Export records contain the reserved chromosome names: "NM" and "QC". "NM" indicates that the
-# aligner could not map the read to the reference sequence set, and "QC" means that the
-# aligner did not attempt to map the read due to some technical limitation. Both of these
-# alignment types are collapsed to the single unmapped alignment state in the SAM record.
-# - The export match descriptor is slightly different than the samtools match descriptor. For
-# this reason it is stored in the optional SAM field 'XD' (and not 'MD'). Note that the
-# export match descriptor differs from the samtools version in two respects: (1) indels
-# are explicitly closed with the '$' character and (2) insertions must be enumerated in
-# the match descriptor. For example a 35-base read with a two-base insertion is described
-# as: 20^2$14
-#
-#
-#
-
-my $version = "2.0.0";
-
-use strict;
-use warnings;
-
-use File::Spec qw(splitpath);
-use Getopt::Long;
-use List::Util qw(min max);
-
-
-use constant {
- EXPORT_INDEX => 6,
- EXPORT_READNO => 7,
- EXPORT_READ => 8,
- EXPORT_QUAL => 9,
- EXPORT_CHROM => 10,
- EXPORT_CONTIG => 11,
- EXPORT_POS => 12,
- EXPORT_STRAND => 13,
- EXPORT_MD => 14,
- EXPORT_SEMAP => 15,
- EXPORT_PEMAP => 16,
- EXPORT_PASSFILT => 21,
-};
-
-
-use constant {
- SAM_QNAME => 0,
- SAM_FLAG => 1,
- SAM_RNAME => 2,
- SAM_POS => 3,
- SAM_MAPQ => 4,
- SAM_CIGAR => 5,
- SAM_MRNM => 6,
- SAM_MPOS => 7,
- SAM_ISIZE => 8,
- SAM_SEQ => 9,
- SAM_QUAL => 10,
-};
-
-
-# function prototypes for Richard's code
-sub match_desc_to_cigar($);
-sub match_desc_frag_length($);
-sub reverse_compl_match_descriptor($);
-sub write_header($;$;$);
-
-
-&export2sam;
-exit;
-
-
-
-
-sub export2sam {
-
- my $cmdline = $0 . " " . join(" ",@ARGV);
- my $arg_count = scalar @ARGV;
- my @spval = File::Spec->splitpath($0);
- my $progname = $spval[2];
-
- my $is_logodds_qvals = 0; # if true, assume files contain logodds (i.e. "solexa") quality values
- my $is_nofilter = 0;
- my $read1file;
- my $read2file;
- my $print_version = 0;
- my $help = 0;
-
- my $result = GetOptions( "qlogodds" => \$is_logodds_qvals,
- "nofilter" => \$is_nofilter,
- "read1=s" => \$read1file,
- "read2=s" => \$read2file,
- "version" => \$print_version,
- "help" => \$help );
-
- my $usage = <<END;
-
-$progname converts GERALD export files to SAM format.
-
-Usage: $progname --read1=FILENAME [ options ] | --version | --help
-
- --read1=FILENAME read1 export file (mandatory)
- --read2=FILENAME read2 export file
- --nofilter include reads that failed the pipeline/RTA purity filter
- --qlogodds assume export file(s) use logodds quality values as reported
- by pipeline prior to v1.3 (default: phred quality values)
-
-END
-
- my $version_msg = <<END;
-
-$progname version: $version
-
-END
-
- if((not $result) or $help or ($arg_count==0)) {
- die($usage);
- }
-
- if(@ARGV) {
- print STDERR "\nERROR: Unrecognized arguments: " . join(" ",@ARGV) . "\n\n";
- die($usage);
- }
-
- if($print_version) {
- die($version_msg);
- }
-
- if(not defined($read1file)) {
- print STDERR "\nERROR: read1 export file must be specified\n\n";
- die($usage);
- }
-
- my ($fh1, $fh2, $is_paired);
-
- open($fh1, $read1file) or die("\nERROR: Can't open read1 export file: $read1file\n\n");
- $is_paired = defined $read2file;
- if ($is_paired) {
- open($fh2, $read2file) or die("\nERROR: Can't open read2 export file: $read2file\n\n");
- }
- # quality value conversion table
- my @conv_table;
- if($is_logodds_qvals){ # convert from solexa+64 quality values (pipeline pre-v1.3):
- for (-64..64) {
- $conv_table[$_+64] = int(33 + 10*log(1+10**($_/10.0))/log(10)+.499);
- }
- } else { # convert from phred+64 quality values (pipeline v1.3+):
- for (-64..-1) {
- $conv_table[$_+64] = undef;
- }
- for (0..64) {
- $conv_table[$_+64] = int(33 + $_);
- }
- }
- # write the header
- print write_header( $progname, $version, $cmdline );
- # core loop
- my $export_line_count = 0;
- while (<$fh1>) {
- $export_line_count++;
- my (@s1, @s2);
- &export2sam_aux($_, $export_line_count, \@s1, \@conv_table, $is_paired, 1, $is_nofilter);
- if ($is_paired) {
- my $read2line = <$fh2>;
- if(not $read2line){
- die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n Extra reads observed in read1 file at line no: $export_line_count.\n\n");
- }
- &export2sam_aux($read2line, $export_line_count, \@s2, \@conv_table, $is_paired, 2, $is_nofilter);
-
- if (@s1 && @s2) { # then set mate coordinate
- if($s1[SAM_QNAME] ne $s2[SAM_QNAME]){
- die("\nERROR: Non-paired reads in export files on line: $export_line_count.\n Read1: $_ Read2: $read2line\n");
- }
-
- my $isize = 0;
- if ($s1[SAM_RNAME] ne '*' && $s1[SAM_RNAME] eq $s2[SAM_RNAME]) { # then calculate $isize
- my $x1 = ($s1[SAM_FLAG] & 0x10)? $s1[SAM_POS] + length($s1[SAM_SEQ]) : $s1[SAM_POS];
- my $x2 = ($s2[SAM_FLAG] & 0x10)? $s2[SAM_POS] + length($s2[SAM_SEQ]) : $s2[SAM_POS];
- $isize = $x2 - $x1;
- }
-
- foreach ([\@s1,\@s2,$isize],[\@s2,\@s1,-$isize]){
- my ($sa,$sb,$is) = @{$_};
- if ($sb->[SAM_RNAME] ne '*') {
- $sa->[SAM_MRNM] = ($sb->[SAM_RNAME] eq $sa->[SAM_RNAME]) ? "=" : $sb->[SAM_RNAME];
- $sa->[SAM_MPOS] = $sb->[SAM_POS];
- $sa->[SAM_ISIZE] = $is;
- $sa->[SAM_FLAG] |= 0x20 if ($sb->[SAM_FLAG] & 0x10);
- } else {
- $sa->[SAM_FLAG] |= 0x8;
- }
- }
- }
- }
- print join("\t", @s1), "\n" if (@s1);
- print join("\t", @s2), "\n" if (@s2 && $is_paired);
- }
- close($fh1);
- if($is_paired) {
- while(my $read2line = <$fh2>){
- $export_line_count++;
- die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n Extra reads observed in read2 file at line no: $export_line_count.\n\n");
- }
- close($fh2);
- }
-}
-
-sub export2sam_aux {
- my ($line, $line_no, $s, $ct, $is_paired, $read_no, $is_nofilter) = @_;
- chomp($line);
- my @t = split("\t", $line);
- @$s = ();
- my $isPassFilt = ($t[EXPORT_PASSFILT] eq 'Y');
- return if(not ($isPassFilt or $is_nofilter));
- # read name
- $s->[SAM_QNAME] = $t[1]? "$t[0]_$t[1]:$t[2]:$t[3]:$t[4]:$t[5]" : "$t[0]:$t[2]:$t[3]:$t[4]:$t[5]";
- # initial flag (will be updated later)
- $s->[SAM_FLAG] = 0;
- if($is_paired) {
- if($t[EXPORT_READNO] != $read_no){
- die("\nERROR: read$read_no export file contains record with read number: " .$t[EXPORT_READNO] . " on line: $line_no\n\n");
- }
- $s->[SAM_FLAG] |= 1 | 1<<(5 + $read_no);
- }
- $s->[SAM_FLAG] |= 0x200 if (not $isPassFilt);
-
- # read & quality
- my $is_export_rev = ($t[EXPORT_STRAND] eq 'R');
- if ($is_export_rev) { # then reverse the sequence and quality
- $s->[SAM_SEQ] = reverse($t[EXPORT_READ]);
- $s->[SAM_SEQ] =~ tr/ACGTacgt/TGCAtgca/;
- $s->[SAM_QUAL] = reverse($t[EXPORT_QUAL]);
- } else {
- $s->[SAM_SEQ] = $t[EXPORT_READ];
- $s->[SAM_QUAL] = $t[EXPORT_QUAL];
- }
- my @convqual = ();
- foreach (unpack('C*', $s->[SAM_QUAL])){
- my $val=$ct->[$_];
- if(not defined $val){
- my $msg="\nERROR: can't interpret export quality value: " . $_ . " in read$read_no export file, line: $line_no\n";
- if( $_ < 64 ) { $msg .= " Use --qlogodds flag to translate logodds (solexa) quality values.\n"; }
- die($msg . "\n");
- }
- push @convqual,$val;
- }
-
- $s->[SAM_QUAL] = pack('C*',@convqual); # change coding
-
-
- # coor
- my $has_coor = 0;
- $s->[SAM_RNAME] = "*";
- if ($t[EXPORT_CHROM] eq 'NM' or $t[EXPORT_CHROM] eq 'QC') {
- $s->[SAM_FLAG] |= 0x4; # unmapped
- } elsif ($t[EXPORT_CHROM] =~ /(\d+):(\d+):(\d+)/) {
- $s->[SAM_FLAG] |= 0x4; # TODO: should I set BAM_FUNMAP in this case?
- push(@$s, "H0:i:$1", "H1:i:$2", "H2:i:$3")
- } elsif ($t[EXPORT_POS] < 1) {
- $s->[SAM_FLAG] |= 0x4; # unmapped
- } else {
- $s->[SAM_RNAME] = $t[EXPORT_CHROM];
- $s->[SAM_RNAME] .= "/" . $t[EXPORT_CONTIG] if($t[EXPORT_CONTIG] ne '');
- $has_coor = 1;
- }
- $s->[SAM_POS] = $has_coor? $t[EXPORT_POS] : 0;
-
-# print STDERR "t[14] = " . $t[14] . "\n";
- my $matchDesc = '';
- $s->[SAM_CIGAR] = "*";
- if($has_coor){
- $matchDesc = ($is_export_rev) ? reverse_compl_match_descriptor($t[EXPORT_MD]) : $t[EXPORT_MD];
-
- if($matchDesc =~ /\^/){
- # construct CIGAR string using Richard's function
- $s->[SAM_CIGAR] = match_desc_to_cigar($matchDesc); # indel processing
- } else {
- $s->[SAM_CIGAR] = length($s->[SAM_SEQ]) . "M";
- }
- }
-
-# print STDERR "cigar_string = $cigar_string\n";
-
- $s->[SAM_FLAG] |= 0x10 if ($has_coor && $is_export_rev);
- if($has_coor){
- my $semap = ($t[EXPORT_SEMAP] ne '') ? $t[EXPORT_SEMAP] : 0;
- my $pemap = 0;
- if($is_paired) {
- $pemap = ($t[EXPORT_PEMAP] ne '') ? $t[EXPORT_PEMAP] : 0;
-
- # set `proper pair' bit if non-blank, non-zero PE alignment score:
- $s->[SAM_FLAG] |= 0x02 if ($pemap > 0);
- }
- $s->[SAM_MAPQ] = min(254,max($semap,$pemap));
- } else {
- $s->[SAM_MAPQ] = 0;
- }
- # mate coordinate
- $s->[SAM_MRNM] = '*';
- $s->[SAM_MPOS] = 0;
- $s->[SAM_ISIZE] = 0;
- # aux
- push(@$s, "BC:Z:$t[EXPORT_INDEX]") if ($t[EXPORT_INDEX]);
- if($has_coor){
- # The export match descriptor differs slightly from the samtools match descriptor.
- # In order for the converted SAM files to be as compliant as possible,
- # we put the export match descriptor in optional field 'XD' rather than 'MD':
- push(@$s, "XD:Z:$matchDesc");
- push(@$s, "SM:i:$t[EXPORT_SEMAP]") if ($t[EXPORT_SEMAP] ne '');
- push(@$s, "AS:i:$t[EXPORT_PEMAP]") if ($is_paired and ($t[EXPORT_PEMAP] ne ''));
- }
-}
-
-
-
-#
-# the following code is taken from Richard Shaw's sorted2sam.pl file
-#
-sub reverse_compl_match_descriptor($)
-{
-# print "\nREVERSING THE MATCH DESCRIPTOR!\n";
- my ($match_desc) = @_;
- my $rev_compl_match_desc = reverse($match_desc);
- $rev_compl_match_desc =~ tr/ACGT\^\$/TGCA\$\^/;
-
- # Unreverse the digits of numbers.
- $rev_compl_match_desc = join('',
- map {($_ =~ /\d+/)
- ? join('', reverse(split('', $_)))
- : $_} split(/(\d+)/,
- $rev_compl_match_desc));
-
- return $rev_compl_match_desc;
-}
-
-
-
-sub match_desc_to_cigar($)
-{
- my ($match_desc) = @_;
-
- my @match_desc_parts = split(/(\^.*?\$)/, $match_desc);
- my $cigar_str = '';
- my $cigar_del_ch = 'D';
- my $cigar_ins_ch = 'I';
- my $cigar_match_ch = 'M';
-
- foreach my $match_desc_part (@match_desc_parts) {
- next if (!$match_desc_part);
-
- if ($match_desc_part =~ /^\^([ACGTN]+)\$$/) {
- # Deletion
- $cigar_str .= (length($1) . $cigar_del_ch);
- } elsif ($match_desc_part =~ /^\^(\d+)\$$/) {
- # Insertion
- $cigar_str .= ($1 . $cigar_ins_ch);
- } else {
- $cigar_str .= (match_desc_frag_length($match_desc_part)
- . $cigar_match_ch);
- }
- }
-
- return $cigar_str;
-}
-
-
-#------------------------------------------------------------------------------
-
-sub match_desc_frag_length($)
- {
- my ($match_desc_str) = @_;
- my $len = 0;
-
- my @match_desc_fields = split(/([ACGTN]+)/, $match_desc_str);
-
- foreach my $match_desc_field (@match_desc_fields) {
- next if ($match_desc_field eq '');
-
- $len += (($match_desc_field =~ /(\d+)/)
- ? $1 : length($match_desc_field));
- }
-
- return $len;
-}
-
-
-# argument holds the command line
-sub write_header($;$;$)
-{
- my ($progname,$version,$cl) = @_;
- my $complete_header = "";
- $complete_header .= "\@PG\tID:$progname\tVN:$version\tCL:$cl\n";
-
- return $complete_header;
-}
+#!/usr/bin/env perl\r
+#\r
+#\r
+# export2sam.pl converts GERALD export files to SAM format.\r
+#\r
+#\r
+#\r
+########## License:\r
+#\r
+# The MIT License\r
+#\r
+# Original SAMtools work copyright (c) 2008-2009 Genome Research Ltd.\r
+# Modified SAMtools work copyright (c) 2010 Illumina, Inc.\r
+#\r
+# Permission is hereby granted, free of charge, to any person obtaining a copy\r
+# of this software and associated documentation files (the "Software"), to deal\r
+# in the Software without restriction, including without limitation the rights\r
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\r
+# copies of the Software, and to permit persons to whom the Software is\r
+# furnished to do so, subject to the following conditions:\r
+#\r
+# The above copyright notice and this permission notice shall be included in\r
+# all copies or substantial portions of the Software.\r
+# \r
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\r
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\r
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\r
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\r
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\r
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\r
+# THE SOFTWARE.\r
+#\r
+#\r
+#\r
+#\r
+########## ChangeLog:\r
+#\r
+# Version: 2.3.1 (18MAR2011)\r
+#\r
+# - Restore file '-' as stdin input.\r
+#\r
+# Version: 2.3.0 (24JAN2011)\r
+#\r
+# - Add support for export reserved chromosome name "CONTROL",\r
+# which is translated to optional field "XC:Z:CONTROL".\r
+# - Check for ".gz" file extension on export files and open\r
+# these as gzip pipes when the extension is found.\r
+#\r
+# Version: 2.2.0 (16NOV2010)\r
+#\r
+# - Remove any leading zeros in export fields: RUNNO,LANE,TILE,X,Y\r
+# - For export records with reserved chromosome name identifiers\r
+# "QC" and "RM", add the optional field "XC:Z:QC" or "XC:Z:RM"\r
+# to the SAM record, so that these cases can be distinguished\r
+# from other unmatched reads.\r
+#\r
+# Version: 2.1.0 (21SEP2010)\r
+#\r
+# - Additional export record error checking.\r
+# - Convert export records with chromomsome value of "RM" to unmapped \r
+# SAM records.\r
+#\r
+# Version: 2.0.0 (15FEB2010)\r
+#\r
+# Script updated by Illumina in conjunction with CASAVA 1.7.0\r
+# release.\r
+#\r
+# Major changes are as follows:\r
+# - The CIGAR string has been updated to include all gaps from\r
+# ELANDv2 alignments.\r
+# - The ELAND single read alignment score is always stored in the\r
+# optional "SM" field and the ELAND paired read alignment score\r
+# is stored in the optional "AS" field when it exists.\r
+# - The MAPQ value is set to the higher of the two alignment scores,\r
+# but no greater than 254, i.e. min(254,max(SM,AS))\r
+# - The SAM "proper pair" bit (0x0002) is now set for read pairs\r
+# meeting ELAND's expected orientation and insert size criteria.\r
+# - The default quality score translation is set for export files\r
+# which contain Phread+64 quality values. An option,\r
+# "--qlogodds", has been added to translate quality values from\r
+# the Solexa+64 format used in export files prior to Pipeline\r
+# 1.3\r
+# - The export match descriptor is now reverse-complemented when\r
+# necessary such that it always corresponds to the forward\r
+# strand of the reference, to be consistent with other\r
+# information in the SAM record. It is now written to the\r
+# optional 'XD' field (rather than 'MD') to acknowledge its\r
+# minor differences from the samtools match descriptor (see\r
+# additional detail below).\r
+# - An option, "--nofilter", has been added to include reads which\r
+# have failed primary analysis quality filtration. Such reads\r
+# will have the corresponding SAM flag bit (0x0200) set.\r
+# - Labels in the export 'contig' field are preserved by setting\r
+# RNAME to "$export_chromosome/$export_contig" when the contig\r
+# label exists.\r
+#\r
+#\r
+# Contact: lh3\r
+# Version: 0.1.2 (03JAN2009)\r
+#\r
+#\r
+#\r
+########## Known Conversion Limitations:\r
+#\r
+# - Export records for reads that map to a position < 1 (allowed\r
+# in export format), are converted to unmapped reads in the SAM\r
+# record.\r
+# - Export records contain the reserved chromosome names: "NM",\r
+# "QC","RM" and "CONTROL". "NM" indicates that the aligner could\r
+# not map the read to the reference sequence set. "QC" means that\r
+# the aligner did not attempt to map the read due to some\r
+# technical limitation. "RM" means that the read mapped to a set\r
+# of 'contaminant' sequences specified in GERALD's RNA-seq\r
+# workflow. "CONTROL" means that the read is a control. All of\r
+# these alignment types are collapsed to the single unmapped\r
+# alignment state in the SAM record, but the optional SAM "XC"\r
+# field is used to record the original reserved chromosome name of\r
+# the read for all but the "NM" case.\r
+# - The export match descriptor is slightly different than the\r
+# samtools match descriptor. For this reason it is stored in the\r
+# optional SAM field 'XD' (and not 'MD'). Note that the export\r
+# match descriptor differs from the samtools version in two\r
+# respects: (1) indels are explicitly closed with the '$'\r
+# character and (2) insertions must be enumerated in the match\r
+# descriptor. For example a 35-base read with a two-base insertion\r
+# is described as: 20^2$14\r
+#\r
+#\r
+#\r
+\r
+my $version = "2.3.1";\r
+\r
+use strict;\r
+use warnings;\r
+\r
+use Getopt::Long;\r
+use File::Spec;\r
+use List::Util qw(min max);\r
+\r
+\r
+use constant {\r
+ EXPORT_MACHINE => 0,\r
+ EXPORT_RUNNO => 1,\r
+ EXPORT_LANE => 2,\r
+ EXPORT_TILE => 3,\r
+ EXPORT_X => 4,\r
+ EXPORT_Y => 5,\r
+ EXPORT_INDEX => 6,\r
+ EXPORT_READNO => 7,\r
+ EXPORT_READ => 8,\r
+ EXPORT_QUAL => 9,\r
+ EXPORT_CHROM => 10,\r
+ EXPORT_CONTIG => 11,\r
+ EXPORT_POS => 12,\r
+ EXPORT_STRAND => 13, \r
+ EXPORT_MD => 14,\r
+ EXPORT_SEMAP => 15,\r
+ EXPORT_PEMAP => 16,\r
+ EXPORT_PASSFILT => 21,\r
+ EXPORT_SIZE => 22,\r
+};\r
+\r
+\r
+use constant {\r
+ SAM_QNAME => 0,\r
+ SAM_FLAG => 1,\r
+ SAM_RNAME => 2,\r
+ SAM_POS => 3,\r
+ SAM_MAPQ => 4,\r
+ SAM_CIGAR => 5,\r
+ SAM_MRNM => 6,\r
+ SAM_MPOS => 7,\r
+ SAM_ISIZE => 8,\r
+ SAM_SEQ => 9,\r
+ SAM_QUAL => 10,\r
+};\r
+\r
+\r
+# function prototypes for Richard's code\r
+sub match_desc_to_cigar($);\r
+sub match_desc_frag_length($);\r
+sub reverse_compl_match_descriptor($);\r
+sub write_header($;$;$);\r
+\r
+\r
+&export2sam;\r
+exit;\r
+\r
+\r
+\r
+\r
+sub export2sam {\r
+\r
+ my $cmdline = $0 . " " . join(" ",@ARGV);\r
+ my $arg_count = scalar @ARGV;\r
+ my $progname = (File::Spec->splitpath($0))[2];\r
+\r
+ my $is_logodds_qvals = 0; # if true, assume files contain logodds (i.e. "solexa") quality values\r
+ my $is_nofilter = 0;\r
+ my $read1file;\r
+ my $read2file;\r
+ my $print_version = 0;\r
+ my $help = 0;\r
+\r
+ my $result = GetOptions( "qlogodds" => \$is_logodds_qvals, \r
+ "nofilter" => \$is_nofilter,\r
+ "read1=s" => \$read1file,\r
+ "read2=s" => \$read2file,\r
+ "version" => \$print_version,\r
+ "help" => \$help );\r
+\r
+ my $usage = <<END;\r
+\r
+$progname converts GERALD export files to SAM format.\r
+\r
+Usage: $progname --read1=FILENAME [ options ] | --version | --help\r
+\r
+ --read1=FILENAME read1 export file or '-' for stdin (mandatory)\r
+ (file may be gzipped with ".gz" extension)\r
+ --read2=FILENAME read2 export file or '-' for stdin\r
+ (file may be gzipped with ".gz" extension)\r
+ --nofilter include reads that failed the basecaller\r
+ purity filter\r
+ --qlogodds assume export file(s) use logodds quality values\r
+ as reported by OLB (Pipeline) prior to v1.3\r
+ (default: phred quality values)\r
+\r
+END\r
+\r
+ my $version_msg = <<END;\r
+\r
+$progname version: $version\r
+\r
+END\r
+\r
+ if((not $result) or $help or ($arg_count==0)) {\r
+ die($usage);\r
+ }\r
+\r
+ if(@ARGV) {\r
+ print STDERR "\nERROR: Unrecognized arguments: " . join(" ",@ARGV) . "\n\n";\r
+ die($usage);\r
+ }\r
+\r
+ if($print_version) {\r
+ die($version_msg);\r
+ }\r
+\r
+ if(not defined($read1file)) {\r
+ print STDERR "\nERROR: read1 export file must be specified\n\n";\r
+ die($usage);\r
+ }\r
+\r
+ unless((-f $read1file) or ($read1file eq '-')) {\r
+ die("\nERROR: Can't find read1 export file: '$read1file'\n\n");\r
+ }\r
+\r
+ if (defined $read2file) {\r
+ unless((-f $read2file) or ($read2file eq '-')) {\r
+ die("\nERROR: Can't find read2 export file: '$read2file'\n\n");\r
+ }\r
+ if($read1file eq $read2file) {\r
+ die("\nERROR: read1 and read2 export filenames are the same: '$read1file'\n\n");\r
+ }\r
+ }\r
+\r
+ my ($fh1, $fh2, $is_paired);\r
+\r
+ my $read1cmd="$read1file";\r
+ $read1cmd = "gzip -dc $read1file |" if($read1file =~ /\.gz$/);\r
+ open($fh1, $read1cmd)\r
+ or die("\nERROR: Can't open read1 process: '$read1cmd'\n\n");\r
+ $is_paired = defined $read2file;\r
+ if ($is_paired) {\r
+ my $read2cmd="$read2file";\r
+ $read2cmd = "gzip -dc $read2file |" if($read2file =~ /\.gz$/);\r
+ open($fh2, $read2cmd)\r
+ or die("\nERROR: Can't open read2 process: '$read2cmd'\n\n");\r
+ }\r
+ # quality value conversion table\r
+ my @conv_table;\r
+ if($is_logodds_qvals){ # convert from solexa+64 quality values (pipeline pre-v1.3):\r
+ for (-64..64) {\r
+ $conv_table[$_+64] = int(33 + 10*log(1+10**($_/10.0))/log(10)+.499);\r
+ }\r
+ } else { # convert from phred+64 quality values (pipeline v1.3+):\r
+ for (-64..-1) {\r
+ $conv_table[$_+64] = undef;\r
+ }\r
+ for (0..64) {\r
+ $conv_table[$_+64] = int(33 + $_);\r
+ }\r
+ }\r
+ # write the header\r
+ print write_header( $progname, $version, $cmdline );\r
+ # core loop\r
+ my $export_line_count = 0;\r
+ while (<$fh1>) {\r
+ $export_line_count++;\r
+ my (@s1, @s2);\r
+ &export2sam_aux($_, $export_line_count, \@s1, \@conv_table, $is_paired, 1, $is_nofilter);\r
+ if ($is_paired) {\r
+ my $read2line = <$fh2>;\r
+ if(not $read2line){\r
+ die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n Extra reads observed in read1 file at line no: $export_line_count.\n\n");\r
+ }\r
+ &export2sam_aux($read2line, $export_line_count, \@s2, \@conv_table, $is_paired, 2, $is_nofilter);\r
+\r
+ if (@s1 && @s2) { # then set mate coordinate\r
+ if($s1[SAM_QNAME] ne $s2[SAM_QNAME]){\r
+ die("\nERROR: Non-paired reads in export files on line: $export_line_count.\n Read1: $_ Read2: $read2line\n");\r
+ }\r
+\r
+ my $isize = 0;\r
+ if ($s1[SAM_RNAME] ne '*' && $s1[SAM_RNAME] eq $s2[SAM_RNAME]) { # then calculate $isize\r
+ my $x1 = ($s1[SAM_FLAG] & 0x10)? $s1[SAM_POS] + length($s1[SAM_SEQ]) : $s1[SAM_POS];\r
+ my $x2 = ($s2[SAM_FLAG] & 0x10)? $s2[SAM_POS] + length($s2[SAM_SEQ]) : $s2[SAM_POS];\r
+ $isize = $x2 - $x1;\r
+ }\r
+\r
+ foreach ([\@s1,\@s2,$isize],[\@s2,\@s1,-$isize]){ \r
+ my ($sa,$sb,$is) = @{$_};\r
+ if ($sb->[SAM_RNAME] ne '*') {\r
+ $sa->[SAM_MRNM] = ($sb->[SAM_RNAME] eq $sa->[SAM_RNAME]) ? "=" : $sb->[SAM_RNAME];\r
+ $sa->[SAM_MPOS] = $sb->[SAM_POS];\r
+ $sa->[SAM_ISIZE] = $is;\r
+ $sa->[SAM_FLAG] |= 0x20 if ($sb->[SAM_FLAG] & 0x10);\r
+ } else {\r
+ $sa->[SAM_FLAG] |= 0x8;\r
+ }\r
+ } \r
+ }\r
+ }\r
+ print join("\t", @s1), "\n" if (@s1);\r
+ print join("\t", @s2), "\n" if (@s2 && $is_paired);\r
+ }\r
+ close($fh1);\r
+ if($is_paired) {\r
+ while(my $read2line = <$fh2>){\r
+ $export_line_count++;\r
+ die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n Extra reads observed in read2 file at line no: $export_line_count.\n\n");\r
+ }\r
+ close($fh2);\r
+ }\r
+}\r
+\r
+sub export2sam_aux {\r
+ my ($line, $line_no, $s, $ct, $is_paired, $read_no, $is_nofilter) = @_;\r
+ chomp($line);\r
+ my @t = split("\t", $line);\r
+ if(scalar(@t) < EXPORT_SIZE) {\r
+ my $msg="\nERROR: Unexpected number of fields in export record on line $line_no of read$read_no export file. Found " . scalar(@t) . " fields but expected " . EXPORT_SIZE . ".\n";\r
+ $msg.="\t...erroneous export record:\n" . $line . "\n\n";\r
+ die($msg);\r
+ }\r
+ @$s = ();\r
+ my $isPassFilt = ($t[EXPORT_PASSFILT] eq 'Y');\r
+ return if(not ($isPassFilt or $is_nofilter));\r
+ # read name\r
+ my $samQnamePrefix = $t[EXPORT_MACHINE] . (($t[EXPORT_RUNNO] ne "") ? "_" . int($t[EXPORT_RUNNO]) : "");\r
+ $s->[SAM_QNAME] = join(':', $samQnamePrefix, int($t[EXPORT_LANE]), int($t[EXPORT_TILE]),\r
+ int($t[EXPORT_X]), int($t[EXPORT_Y]));\r
+ # initial flag (will be updated later)\r
+ $s->[SAM_FLAG] = 0;\r
+ if($is_paired) {\r
+ if($t[EXPORT_READNO] != $read_no){\r
+ die("\nERROR: read$read_no export file contains record with read number: " .$t[EXPORT_READNO] . " on line: $line_no\n\n");\r
+ }\r
+ $s->[SAM_FLAG] |= 1 | 1<<(5 + $read_no);\r
+ }\r
+ $s->[SAM_FLAG] |= 0x200 if (not $isPassFilt);\r
+\r
+ # read & quality\r
+ my $is_export_rev = ($t[EXPORT_STRAND] eq 'R');\r
+ if ($is_export_rev) { # then reverse the sequence and quality\r
+ $s->[SAM_SEQ] = reverse($t[EXPORT_READ]);\r
+ $s->[SAM_SEQ] =~ tr/ACGTacgt/TGCAtgca/;\r
+ $s->[SAM_QUAL] = reverse($t[EXPORT_QUAL]);\r
+ } else {\r
+ $s->[SAM_SEQ] = $t[EXPORT_READ];\r
+ $s->[SAM_QUAL] = $t[EXPORT_QUAL];\r
+ }\r
+ my @convqual = ();\r
+ foreach (unpack('C*', $s->[SAM_QUAL])){\r
+ my $val=$ct->[$_];\r
+ if(not defined $val){\r
+ my $msg="\nERROR: can't interpret export quality value: " . $_ . " in read$read_no export file, line: $line_no\n";\r
+ if( $_ < 64 ) { $msg .= " Use --qlogodds flag to translate logodds (solexa) quality values.\n"; }\r
+ die($msg . "\n");\r
+ }\r
+ push @convqual,$val;\r
+ }\r
+\r
+ $s->[SAM_QUAL] = pack('C*',@convqual); # change coding\r
+\r
+\r
+ # coor\r
+ my $has_coor = 0;\r
+ $s->[SAM_RNAME] = "*";\r
+ if (($t[EXPORT_CHROM] eq 'NM') or\r
+ ($t[EXPORT_CHROM] eq 'QC') or\r
+ ($t[EXPORT_CHROM] eq 'RM') or\r
+ ($t[EXPORT_CHROM] eq 'CONTROL')) {\r
+ $s->[SAM_FLAG] |= 0x4; # unmapped\r
+ push(@$s,"XC:Z:".$t[EXPORT_CHROM]) if($t[EXPORT_CHROM] ne 'NM');\r
+ } elsif ($t[EXPORT_CHROM] =~ /(\d+):(\d+):(\d+)/) {\r
+ $s->[SAM_FLAG] |= 0x4; # TODO: should I set BAM_FUNMAP in this case?\r
+ push(@$s, "H0:i:$1", "H1:i:$2", "H2:i:$3")\r
+ } elsif ($t[EXPORT_POS] < 1) {\r
+ $s->[SAM_FLAG] |= 0x4; # unmapped\r
+ } else {\r
+ $s->[SAM_RNAME] = $t[EXPORT_CHROM];\r
+ $s->[SAM_RNAME] .= "/" . $t[EXPORT_CONTIG] if($t[EXPORT_CONTIG] ne '');\r
+ $has_coor = 1;\r
+ }\r
+ $s->[SAM_POS] = $has_coor? $t[EXPORT_POS] : 0;\r
+\r
+# print STDERR "t[14] = " . $t[14] . "\n";\r
+ my $matchDesc = '';\r
+ $s->[SAM_CIGAR] = "*";\r
+ if($has_coor){\r
+ $matchDesc = ($is_export_rev) ? reverse_compl_match_descriptor($t[EXPORT_MD]) : $t[EXPORT_MD];\r
+\r
+ if($matchDesc =~ /\^/){\r
+ # construct CIGAR string using Richard's function\r
+ $s->[SAM_CIGAR] = match_desc_to_cigar($matchDesc); # indel processing\r
+ } else {\r
+ $s->[SAM_CIGAR] = length($s->[SAM_SEQ]) . "M";\r
+ }\r
+ }\r
+\r
+# print STDERR "cigar_string = $cigar_string\n";\r
+\r
+ $s->[SAM_FLAG] |= 0x10 if ($has_coor && $is_export_rev);\r
+ if($has_coor){\r
+ my $semap = ($t[EXPORT_SEMAP] ne '') ? $t[EXPORT_SEMAP] : 0;\r
+ my $pemap = 0;\r
+ if($is_paired) {\r
+ $pemap = ($t[EXPORT_PEMAP] ne '') ? $t[EXPORT_PEMAP] : 0;\r
+\r
+ # set `proper pair' bit if non-blank, non-zero PE alignment score:\r
+ $s->[SAM_FLAG] |= 0x02 if ($pemap > 0);\r
+ }\r
+ $s->[SAM_MAPQ] = min(254,max($semap,$pemap));\r
+ } else {\r
+ $s->[SAM_MAPQ] = 0;\r
+ }\r
+ # mate coordinate\r
+ $s->[SAM_MRNM] = '*';\r
+ $s->[SAM_MPOS] = 0;\r
+ $s->[SAM_ISIZE] = 0;\r
+ # aux\r
+ push(@$s, "BC:Z:$t[EXPORT_INDEX]") if ($t[EXPORT_INDEX]);\r
+ if($has_coor){\r
+ # The export match descriptor differs slightly from the samtools match descriptor.\r
+ # In order for the converted SAM files to be as compliant as possible,\r
+ # we put the export match descriptor in optional field 'XD' rather than 'MD':\r
+ push(@$s, "XD:Z:$matchDesc");\r
+ push(@$s, "SM:i:$t[EXPORT_SEMAP]") if ($t[EXPORT_SEMAP] ne '');\r
+ push(@$s, "AS:i:$t[EXPORT_PEMAP]") if ($is_paired and ($t[EXPORT_PEMAP] ne ''));\r
+ }\r
+}\r
+\r
+\r
+\r
+#\r
+# the following code is taken from Richard Shaw's sorted2sam.pl file\r
+#\r
+sub reverse_compl_match_descriptor($)\r
+{\r
+# print "\nREVERSING THE MATCH DESCRIPTOR!\n";\r
+ my ($match_desc) = @_;\r
+ my $rev_compl_match_desc = reverse($match_desc);\r
+ $rev_compl_match_desc =~ tr/ACGT\^\$/TGCA\$\^/;\r
+\r
+ # Unreverse the digits of numbers.\r
+ $rev_compl_match_desc = join('',\r
+ map {($_ =~ /\d+/)\r
+ ? join('', reverse(split('', $_)))\r
+ : $_} split(/(\d+)/,\r
+ $rev_compl_match_desc));\r
+\r
+ return $rev_compl_match_desc;\r
+}\r
+\r
+\r
+\r
+sub match_desc_to_cigar($)\r
+{\r
+ my ($match_desc) = @_;\r
+\r
+ my @match_desc_parts = split(/(\^.*?\$)/, $match_desc);\r
+ my $cigar_str = '';\r
+ my $cigar_del_ch = 'D';\r
+ my $cigar_ins_ch = 'I';\r
+ my $cigar_match_ch = 'M';\r
+\r
+ foreach my $match_desc_part (@match_desc_parts) {\r
+ next if (!$match_desc_part);\r
+\r
+ if ($match_desc_part =~ /^\^([ACGTN]+)\$$/) {\r
+ # Deletion\r
+ $cigar_str .= (length($1) . $cigar_del_ch);\r
+ } elsif ($match_desc_part =~ /^\^(\d+)\$$/) {\r
+ # Insertion\r
+ $cigar_str .= ($1 . $cigar_ins_ch);\r
+ } else {\r
+ $cigar_str .= (match_desc_frag_length($match_desc_part)\r
+ . $cigar_match_ch);\r
+ }\r
+ }\r
+\r
+ return $cigar_str;\r
+}\r
+\r
+\r
+#------------------------------------------------------------------------------\r
+\r
+sub match_desc_frag_length($)\r
+ {\r
+ my ($match_desc_str) = @_;\r
+ my $len = 0;\r
+\r
+ my @match_desc_fields = split(/([ACGTN]+)/, $match_desc_str);\r
+\r
+ foreach my $match_desc_field (@match_desc_fields) {\r
+ next if ($match_desc_field eq '');\r
+\r
+ $len += (($match_desc_field =~ /(\d+)/)\r
+ ? $1 : length($match_desc_field));\r
+ }\r
+\r
+ return $len;\r
+}\r
+\r
+\r
+# argument holds the command line\r
+sub write_header($;$;$) \r
+{\r
+ my ($progname,$version,$cl) = @_;\r
+ my $complete_header = "";\r
+ $complete_header .= "\@PG\tID:$progname\tVN:$version\tCL:$cl\n";\r
+\r
+ return $complete_header;\r
+}\r
{
samfile_t *fp;
fp = (samfile_t*)calloc(1, sizeof(samfile_t));
- if (mode[0] == 'r') { // read
+ if (strchr(mode, 'r')) { // read
fp->type |= TYPE_READ;
- if (mode[1] == 'b') { // binary
+ if (strchr(mode, 'b')) { // binary
fp->type |= TYPE_BAM;
fp->x.bam = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r");
if (fp->x.bam == 0) goto open_err_ret;
fprintf(stderr, "[samopen] no @SQ lines in the header.\n");
} else fprintf(stderr, "[samopen] SAM header is present: %d sequences.\n", fp->header->n_targets);
}
- } else if (mode[0] == 'w') { // write
+ } else if (strchr(mode, 'w')) { // write
fp->header = bam_header_dup((const bam_header_t*)aux);
- if (mode[1] == 'b') { // binary
+ if (strchr(mode, 'b')) { // binary
char bmode[3];
- bmode[0] = 'w'; bmode[1] = strstr(mode, "u")? 'u' : 0; bmode[2] = 0;
+ int i, compress_level = -1;
+ for (i = 0; mode[i]; ++i) if (mode[i] >= '0' && mode[i] <= '9') break;
+ if (mode[i]) compress_level = mode[i] - '0';
+ if (strchr(mode, 'u')) compress_level = 0;
+ bmode[0] = 'w'; bmode[1] = compress_level < 0? 0 : compress_level + '0'; bmode[2] = 0;
fp->type |= TYPE_BAM;
fp->x.bam = strcmp(fn, "-")? bam_open(fn, bmode) : bam_dopen(fileno(stdout), bmode);
if (fp->x.bam == 0) goto open_err_ret;
// open file
fp->x.tamw = strcmp(fn, "-")? fopen(fn, "w") : stdout;
if (fp->x.tamr == 0) goto open_err_ret;
- if (strstr(mode, "X")) fp->type |= BAM_OFSTR<<2;
- else if (strstr(mode, "x")) fp->type |= BAM_OFHEX<<2;
+ if (strchr(mode, 'X')) fp->type |= BAM_OFSTR<<2;
+ else if (strchr(mode, 'x')) fp->type |= BAM_OFHEX<<2;
else fp->type |= BAM_OFDEC<<2;
// write header
- if (strstr(mode, "h")) {
+ if (strchr(mode, 'h')) {
int i;
bam_header_t *alt;
// parse the header text
int main_samview(int argc, char *argv[])
{
- int c, is_header = 0, is_header_only = 0, is_bamin = 1, ret = 0, is_uncompressed = 0, is_bamout = 0, slx2sngr = 0, is_count = 0;
+ int c, is_header = 0, is_header_only = 0, is_bamin = 1, ret = 0, compress_level = -1, is_bamout = 0, slx2sngr = 0, is_count = 0;
int of_type = BAM_OFDEC, is_long_help = 0;
int count = 0;
samfile_t *in = 0, *out = 0;
/* parse command-line options */
strcpy(in_mode, "r"); strcpy(out_mode, "w");
- while ((c = getopt(argc, argv, "Sbct:hHo:q:f:F:ul:r:xX?T:CR:")) >= 0) {
+ while ((c = getopt(argc, argv, "Sbct:h1Ho:q:f:F:ul:r:xX?T:CR:")) >= 0) {
switch (c) {
case 'c': is_count = 1; break;
case 'C': slx2sngr = 1; break;
case 'f': g_flag_on = strtol(optarg, 0, 0); break;
case 'F': g_flag_off = strtol(optarg, 0, 0); break;
case 'q': g_min_mapQ = atoi(optarg); break;
- case 'u': is_uncompressed = 1; break;
+ case 'u': compress_level = 0; break;
+ case '1': compress_level = 1; break;
case 'l': g_library = strdup(optarg); break;
case 'r': g_rg = strdup(optarg); break;
case 'R': fn_rg = strdup(optarg); break;
default: return usage(is_long_help);
}
}
- if (is_uncompressed) is_bamout = 1;
+ if (compress_level >= 0) is_bamout = 1;
if (is_header_only) is_header = 1;
if (is_bamout) strcat(out_mode, "b");
else {
}
if (is_bamin) strcat(in_mode, "b");
if (is_header) strcat(out_mode, "h");
- if (is_uncompressed) strcat(out_mode, "u");
+ if (compress_level >= 0) {
+ char tmp[2];
+ tmp[0] = compress_level + '0'; tmp[1] = '\0';
+ strcat(out_mode, tmp);
+ }
if (argc == optind) return usage(is_long_help); // potential memory leak...
// read the list of read groups
fprintf(stderr, " -H print header only (no alignments)\n");
fprintf(stderr, " -S input is SAM\n");
fprintf(stderr, " -u uncompressed BAM output (force -b)\n");
+ fprintf(stderr, " -1 fast compression (force -b)\n");
fprintf(stderr, " -x output FLAG in HEX (samtools-C specific)\n");
fprintf(stderr, " -X output FLAG in string (samtools-C specific)\n");
fprintf(stderr, " -c print only the count of matching records\n");
-.TH samtools 1 "1 March 2011" "samtools-0.1.13" "Bioinformatics tools"
+.TH samtools 1 "16 March 2011" "samtools-0.1.14" "Bioinformatics tools"
.SH NAME
.PP
samtools - Utilities for the Sequence Alignment/Map (SAM) format
.B OPTIONS:
.RS
-.TP 8
+.TP 10
+.B -A
+Do not skip anomalous read pairs in variant calling.
+.TP
.B -B
Disable probabilistic realignment for the computation of base alignment
quality (BAQ). BAQ is the Phred-scaled probability of a read base being
about sqrt((INT-q)/INT)*INT. A zero value disables this
functionality; if enabled, the recommended value for BWA is 50. [0]
.TP
+.BI -d \ INT
+At a position, read maximally
+.I INT
+reads per input BAM. [250]
+.TP
+.B -D
+Output per-sample read depth
+.TP
.BI -e \ INT
Phred-scaled gap extension sequencing error probability. Reducing
.I INT
.IR INT * s / l .
[100]
.TP
+.B -I
+Do not perform INDEL calling
+.TP
.BI -l \ FILE
File containing a list of sites where pileup or BCF is outputted [null]
.TP
+.BI -L \ INT
+Skip INDEL calling if the average per-sample depth is above
+.IR INT .
+[250]
+.TP
.BI -o \ INT
Phred-scaled gap open sequencing error probability. Reducing
.I INT
.I STR
[all sites]
.TP
+.B -S
+Output per-sample Phred-scaled strand bias P-value
+.TP
.B -u
Similar to
.B -g
This command is much faster than replacing the header with a
BAM->SAM->BAM conversion.
+.TP
+.B cat
+samtools cat [-h header.sam] [-o out.bam] <in1.bam> <in2.bam> [ ... ]
+
+Concatenate BAMs. The sequence dictionary of each input BAM must be identical,
+although this command does not check this. This command uses a similar trick
+to
+.B reheader
+which enables fast BAM concatenation.
+
.TP
.B sort
samtools sort [-no] [-m maxMem] <in.bam> <out.prefix>