# Install bcftools, its manpage, bcf-fix.pl, vcfutils.pl, and new examples.

[samtools.git] / samtools.1
diff --git a/samtools.1 b/samtools.1

index d79d176468ebe53f41bd45e7447cd1d2b0704a4e..cc421d4bc34fd53efa123b4630fcbccb4ca58708 100644 (file)
--- a/samtools.1
+++ b/samtools.1
@@ -1,4 +1,4 @@
-.TH samtools 1 "11 July 2010" "samtools-0.1.8" "Bioinformatics tools"
+.TH samtools 1 "27 October 2010" "samtools-0.1.9" "Bioinformatics tools"
  .SH NAME
  .PP
  samtools - Utilities for the Sequence Alignment/Map (SAM) format
  .SH NAME
  .PP
  samtools - Utilities for the Sequence Alignment/Map (SAM) format
@@ -18,9 +18,9 @@ samtools merge out.bam in1.bam in2.bam in3.bam
  .PP
  samtools faidx ref.fasta
  .PP
  .PP
  samtools faidx ref.fasta
  .PP
-samtools pileup -f ref.fasta aln.sorted.bam
+samtools pileup -vcf ref.fasta aln.sorted.bam
  .PP
  .PP
-samtools mpileup -f ref.fasta -r chr3:1,000-2,000 in1.bam in2.bam
+samtools mpileup -C50 -agf ref.fasta -r chr3:1,000-2,000 in1.bam in2.bam
  .PP
  samtools tview aln.sorted.bam ref.fasta
  
  .PP
  samtools tview aln.sorted.bam ref.fasta
  
@@ -65,10 +65,12 @@ format: `chr2' (the whole chr2), `chr2:1000000' (region starting from
  .B -b
  Output in the BAM format.
  .TP
  .B -b
  Output in the BAM format.
  .TP
-.B -u
-Output uncompressed BAM. This option saves time spent on
-compression/decomprssion and is thus preferred when the output is piped
-to another samtools command.
+.BI -f " INT"
+Only output alignments with all bits in INT present in the FLAG
+field. INT can be in hex in the format of /^0x[0-9A-F]+/ [0]
+.TP
+.BI -F " INT"
+Skip alignments with bits present in INT [0]
  .TP
  .B -h
  Include the header in the output.
  .TP
  .B -h
  Include the header in the output.
@@ -76,12 +78,29 @@ Include the header in the output.
  .B -H
  Output the header only.
  .TP
  .B -H
  Output the header only.
  .TP
+.BI -l " STR"
+Only output reads in library STR [null]
+.TP
+.BI -o " FILE"
+Output file [stdout]
+.TP
+.BI -q " INT"
+Skip alignments with MAPQ smaller than INT [0]
+.TP
+.BI -r " STR"
+Only output reads in read group STR [null]
+.TP
+.BI -R " FILE"
+Output reads in read groups listed in
+.I FILE
+[null]
+.TP
  .B -S
  Input is in SAM. If @SQ header lines are absent, the
  .B `-t'
  option is required.
  .TP
  .B -S
  Input is in SAM. If @SQ header lines are absent, the
  .B `-t'
  option is required.
  .TP
-.B -t FILE
+.BI -t " FILE"
  This file is TAB-delimited. Each line must contain the reference name
  and the length of the reference, one line for each distinct reference;
  additional fields are ignored. This file also defines the order of the
  This file is TAB-delimited. Each line must contain the reference name
  and the length of the reference, one line for each distinct reference;
  additional fields are ignored. This file also defines the order of the
@@ -92,29 +111,10 @@ can be used as this
  .I <in.ref_list>
  file.
  .TP
  .I <in.ref_list>
  file.
  .TP
-.B -o FILE
-Output file [stdout]
-.TP
-.B -f INT
-Only output alignments with all bits in INT present in the FLAG
-field. INT can be in hex in the format of /^0x[0-9A-F]+/ [0]
-.TP
-.B -F INT
-Skip alignments with bits present in INT [0]
-.TP
-.B -q INT
-Skip alignments with MAPQ smaller than INT [0]
-.TP
-.B -l STR
-Only output reads in library STR [null]
-.TP
-.B -r STR
-Only output reads in read group STR [null]
-.TP
-.B -R FILE
-Output reads in read groups listed in
-.I FILE
-[null]
+.B -u
+Output uncompressed BAM. This option saves time spent on
+compression/decomprssion and is thus preferred when the output is piped
+to another samtools command.
  .RE
  
  .TP
  .RE
  
  .TP
@@ -128,8 +128,10 @@ viewing the same reference sequence.
  
  .TP
  .B pileup
  
  .TP
  .B pileup
-samtools pileup [-f in.ref.fasta] [-t in.ref_list] [-l in.site_list]
-[-iscgS2] [-T theta] [-N nHap] [-r pairDiffRate] <in.bam>|<in.sam>
+samtools pileup [-2sSBicv] [-f in.ref.fasta] [-t in.ref_list] [-l
+in.site_list] [-C capMapQ] [-M maxMapQ] [-T theta] [-N nHap] [-r
+pairDiffRate] [-m mask] [-d maxIndelDepth] [-G indelPrior]
+<in.bam>|<in.sam>
  
  Print the alignment in the pileup format. In the pileup format, each
  line represents a genomic position, consisting of chromosome name,
  
  Print the alignment in the pileup format. In the pileup format, each
  line represents a genomic position, consisting of chromosome name,
@@ -138,17 +140,17 @@ mapping qualities. Information on match, mismatch, indel, strand,
  mapping quality and start and end of a read are all encoded at the read
  base column. At this column, a dot stands for a match to the reference
  base on the forward strand, a comma for a match on the reverse strand,
  mapping quality and start and end of a read are all encoded at the read
  base column. At this column, a dot stands for a match to the reference
  base on the forward strand, a comma for a match on the reverse strand,
-`ACGTN' for a mismatch on the forward strand and `acgtn' for a mismatch
-on the reverse strand. A pattern `\\+[0-9]+[ACGTNacgtn]+' indicates
-there is an insertion between this reference position and the next
-reference position. The length of the insertion is given by the integer
-in the pattern, followed by the inserted sequence. Similarly, a pattern
-`-[0-9]+[ACGTNacgtn]+' represents a deletion from the reference. The
-deleted bases will be presented as `*' in the following lines. Also at
-the read base column, a symbol `^' marks the start of a read segment
-which is a contiguous subsequence on the read separated by `N/S/H' CIGAR
-operations. The ASCII of the character following `^' minus 33 gives the
-mapping quality. A symbol `$' marks the end of a read segment.
+a '>' or '<' for a reference skip, `ACGTN' for a mismatch on the forward
+strand and `acgtn' for a mismatch on the reverse strand. A pattern
+`\\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion between this
+reference position and the next reference position. The length of the
+insertion is given by the integer in the pattern, followed by the
+inserted sequence. Similarly, a pattern `-[0-9]+[ACGTNacgtn]+'
+represents a deletion from the reference. The deleted bases will be
+presented as `*' in the following lines. Also at the read base column, a
+symbol `^' marks the start of a read. The ASCII of the character
+following `^' minus 33 gives the mapping quality. A symbol `$' marks the
+end of a read segment.
  
  If option
  .B -c
  
  If option
  .B -c
@@ -168,102 +170,139 @@ The position of indels is offset by -1.
  .B OPTIONS:
  .RS
  .TP 10
  .B OPTIONS:
  .RS
  .TP 10
-.B -s
-Print the mapping quality as the last column. This option makes the
-output easier to parse, although this format is not space efficient.
+.B -B
+Disable the BAQ computation. See the
+.B mpileup
+command for details.
  .TP
  .TP
-.B -S
-The input file is in SAM.
+.B -c
+Call the consensus sequence using SOAPsnp consensus model. Options
+.BR -T ", " -N ", " -I " and " -r
+are only effective when
+.BR -c " or " -g
+is in use.
  .TP
  .TP
-.B -i
-Only output pileup lines containing indels.
+.BI -C " INT"
+Coefficient for downgrading the mapping quality of poorly mapped
+reads. See the
+.B mpileup
+command for details. [0]
  .TP
  .TP
-.B -f FILE
+.BI -d " INT"
+Use the first
+.I NUM
+reads in the pileup for indel calling for speed up. Zero for unlimited. [1024]
+.TP
+.BI -f " FILE"
  The reference sequence in the FASTA format. Index file
  .I FILE.fai
  will be created if
  absent.
  .TP
  The reference sequence in the FASTA format. Index file
  .I FILE.fai
  will be created if
  absent.
  .TP
-.B -M INT
-Cap mapping quality at INT [60]
+.B -g
+Generate genotype likelihood in the binary GLFv3 format. This option
+suppresses -c, -i and -s. This option is deprecated by the
+.B mpileup
+command.
  .TP
  .TP
-.B -m INT
+.B -i
+Only output pileup lines containing indels.
+.TP
+.BI -I " INT"
+Phred probability of an indel in sequencing/prep. [40]
+.TP
+.BI -l " FILE"
+List of sites at which pileup is output. This file is space
+delimited. The first two columns are required to be chromosome and
+1-based coordinate. Additional columns are ignored. It is
+recommended to use option
+.TP
+.BI -m " INT"
  Filter reads with flag containing bits in
  Filter reads with flag containing bits in
-.I
-INT
+.I INT
  [1796]
  .TP
  [1796]
  .TP
-.B -d INT
-Use the first
-.I NUM
-reads in the pileup for indel calling for speed up. Zero for unlimited. [0]
+.BI -M " INT"
+Cap mapping quality at INT [60]
+.TP
+.BI -N " INT"
+Number of haplotypes in the sample (>=2) [2]
  .TP
  .TP
-.B -t FILE
+.BI -r " FLOAT"
+Expected fraction of differences between a pair of haplotypes [0.001]
+.TP
+.B -s
+Print the mapping quality as the last column. This option makes the
+output easier to parse, although this format is not space efficient.
+.TP
+.B -S
+The input file is in SAM.
+.TP
+.BI -t " FILE"
  List of reference names ane sequence lengths, in the format described
  for the
  .B import
  command. If this option is present, samtools assumes the input
  .I <in.alignment>
  is in SAM format; otherwise it assumes in BAM format.
  List of reference names ane sequence lengths, in the format described
  for the
  .B import
  command. If this option is present, samtools assumes the input
  .I <in.alignment>
  is in SAM format; otherwise it assumes in BAM format.
-.TP
-.B -l FILE
-List of sites at which pileup is output. This file is space
-delimited. The first two columns are required to be chromosome and
-1-based coordinate. Additional columns are ignored. It is
-recommended to use option
  .B -s
  together with
  .B -l
  as in the default format we may not know the mapping quality.
  .TP
  .B -s
  together with
  .B -l
  as in the default format we may not know the mapping quality.
  .TP
-.B -c
-Call the consensus sequence using SOAPsnp consensus model. Options
-.B -T,
-.B -N,
-.B -I
-and
-.B -r
-are only effective when
-.B -c
-or
-.B -g
-is in use.
-.TP
-.B -g
-Generate genotype likelihood in the binary GLFv3 format. This option
-suppresses -c, -i and -s.
-.TP
-.B -T FLOAT
+.BI -T " FLOAT"
  The theta parameter (error dependency coefficient) in the maq consensus
  calling model [0.85]
  The theta parameter (error dependency coefficient) in the maq consensus
  calling model [0.85]
-.TP
-.B -N INT
-Number of haplotypes in the sample (>=2) [2]
-.TP
-.B -r FLOAT
-Expected fraction of differences between a pair of haplotypes [0.001]
-.TP
-.B -I INT
-Phred probability of an indel in sequencing/prep. [40]
  .RE
  
  .TP
  .B mpileup
  .RE
  
  .TP
  .B mpileup
-samtools mpileup [-r reg] [-f in.fa] in.bam [in2.bam [...]]
+samtools mpileup [-Bug] [-C capQcoef] [-r reg] [-f in.fa] [-l list] [-M capMapQ] [-Q minBaseQ] [-q minMapQ] in.bam [in2.bam [...]]
  
  
-Generate pileup for multiple BAM files. Consensus calling is not
-implemented.
+Generate BCF or pileup for one or multiple BAM files. Alignment records
+are grouped by sample identifiers in @RG header lines. If sample
+identifiers are absent, each input file is regarded as one sample.
  
  .B OPTIONS:
  .RS
  .TP 8
  
  .B OPTIONS:
  .RS
  .TP 8
-.B -r STR
+.B -B
+Disable probabilistic realignment for the computation of base alignment
+quality (BAQ). BAQ is the Phred-scaled probability of a read base being
+misaligned. Applying this option greatly helps to reduce false SNPs
+caused by misalignments.
+.TP
+.BI -C " INT"
+Coefficient for downgrading mapping quality for reads containing
+excessive mismatches. Given a read with a phred-scaled probability q of
+being generated from the mapped position, the new mapping quality is
+about sqrt((INT-q)/INT)*INT. A zero value disables this
+functionality; if enabled, the recommended value is 50. [0]
+.TP
+.BI -f " FILE"
+The reference file [null]
+.TP
+.B -g
+Compute genotype likelihoods and output them in the binary call format (BCF).
+.TP
+.B -u
+Similar to
+.B -g
+except that the output is uncompressed BCF, which is preferred for pipeing.
+.TP
+.BI -l " FILE"
+File containing a list of sites where pileup or BCF is outputted [null]
+.TP
+.BI -q " INT"
+Minimum mapping quality for an alignment to be used [0]
+.TP
+.BI -Q " INT"
+Minimum base quality for a base to be considered [13]
+.TP
+.BI -r " STR"
  Only generate pileup in region
  .I STR
  [all sites]
  Only generate pileup in region
  .I STR
  [all sites]
-.TP
-.B -f FILE
-The reference file [null]
  .RE
  
  .TP
  .RE
  
  .TP
@@ -303,7 +342,7 @@ Approximately the maximum required memory. [500000000]
  
  .TP
  .B merge
  
  .TP
  .B merge
-samtools merge [-h inh.sam] [-nr] <out.bam> <in1.bam> <in2.bam> [...]
+samtools merge [-nur] [-h inh.sam] [-R reg] <out.bam> <in1.bam> <in2.bam> [...]
  
  Merge multiple sorted alignments.
  The header reference lists of all the input BAM files, and the @SQ headers of
  
  Merge multiple sorted alignments.
  The header reference lists of all the input BAM files, and the @SQ headers of
@@ -320,7 +359,7 @@ and the headers of other files will be ignored.
  .B OPTIONS:
  .RS
  .TP 8
  .B OPTIONS:
  .RS
  .TP 8
-.B -h FILE
+.BI -h " FILE"
  Use the lines of
  .I FILE
  as `@' headers to be copied to
  Use the lines of
  .I FILE
  as `@' headers to be copied to
@@ -331,12 +370,19 @@ replacing any header lines that would otherwise be copied from
  is actually in SAM format, though any alignment records it may contain
  are ignored.)
  .TP
  is actually in SAM format, though any alignment records it may contain
  are ignored.)
  .TP
+.BI -R " STR"
+Merge files in the specified region indicated by
+.I STR
+.TP
  .B -r
  Attach an RG tag to each alignment. The tag value is inferred from file names.
  .TP
  .B -n
  The input alignments are sorted by read names rather than by chromosomal
  coordinates
  .B -r
  Attach an RG tag to each alignment. The tag value is inferred from file names.
  .TP
  .B -n
  The input alignments are sorted by read names rather than by chromosomal
  coordinates
+.TP
+.B -u
+Uncompressed BAM output
  .RE
  
  .TP
  .RE
  
  .TP
@@ -402,7 +448,7 @@ Treat paired-end reads and single-end reads.
  
  .TP
  .B calmd
  
  .TP
  .B calmd
-samtools calmd [-eubS] <aln.bam> <ref.fasta>
+samtools calmd [-eubSr] [-C capQcoef] <aln.bam> <ref.fasta>
  
  Generate the MD tag. If the MD tag is already present, this command will
  give a warning if the MD tag generated is different from the existing
  
  Generate the MD tag. If the MD tag is already present, this command will
  give a warning if the MD tag generated is different from the existing
@@ -423,6 +469,15 @@ Output compressed BAM
  .TP
  .B -S
  The input is SAM with header lines
  .TP
  .B -S
  The input is SAM with header lines
+.TP
+.BI -C " INT"
+Coefficient to cap mapping quality of poorly mapped reads. See the
+.B pileup
+command for details. [0]
+.TP
+.B -r
+Perform probabilistic realignment to compute BAQ, which will be used to
+cap base quality.
  .RE
  
  .SH SAM FORMAT
  .RE
  
  .SH SAM FORMAT
@@ -472,6 +527,125 @@ _
  0x0400 d       the read is either a PCR or an optical duplicate
  .TE
  
  0x0400 d       the read is either a PCR or an optical duplicate
  .TE
  
+.SH EXAMPLES
+.IP o 2
+Import SAM to BAM when
+.B @SQ
+lines are present in the header:
+
+  samtools view -bS aln.sam > aln.bam
+
+If
+.B @SQ
+lines are absent:
+
+  samtools faidx ref.fa
+  samtools view -bt ref.fa.fai aln.sam > aln.bam
+
+where
+.I ref.fa.fai
+is generated automatically by the
+.B faidx
+command.
+
+.IP o 2
+Attach the
+.B RG
+tag while merging sorted alignments:
+
+  perl -e 'print "@RG\\tID:ga\\tSM:hs\\tLB:ga\\tPL:Illumina\\n@RG\\tID:454\\tSM:hs\\tLB:454\\tPL:454\\n"' > rg.txt
+  samtools merge -rh rg.txt merged.bam ga.bam 454.bam
+
+The value in a
+.B RG
+tag is determined by the file name the read is coming from. In this
+example, in the
+.IR merged.bam ,
+reads from
+.I ga.bam
+will be attached 
+.IR RG:Z:ga ,
+while reads from
+.I 454.bam
+will be attached
+.IR RG:Z:454 .
+
+.IP o 2
+Call SNPs and short indels for one diploid individual:
+
+  samtools pileup -vcf ref.fa aln.bam > var.raw.plp
+  samtools.pl varFilter -D 100 var.raw.plp > var.flt.plp
+  awk '($3=="*"&&$6>=50)||($3!="*"&&$6>=20)' var.flt.plp > var.final.plp
+
+The
+.B -D
+option of varFilter controls the maximum read depth, which should be
+adjusted to about twice the average read depth.  One may consider to add
+.B -C50
+to
+.B pileup
+if mapping quality is overestimated for reads containing excessive
+mismatches. Applying this option usually helps
+.B BWA-short
+but may not other mappers. It also potentially increases reference
+biases.
+
+.IP o 2
+Call SNPs (not short indels) for multiple diploid individuals:
+
+  samtools mpileup -augf ref.fa *.bam | bcftools view -bcv - > snp.raw.bcf
+  bcftools view snp.raw.bcf | vcfutils.pl filter4vcf -D 2000 | bgzip > snp.flt.vcf.gz
+
+Individuals are identified from the
+.B SM
+tags in the
+.B @RG
+header lines. Individuals can be pooled in one alignment file; one
+individual can also be separated into multiple files. Similarly, one may
+consider to apply
+.B -C50
+to
+.BR mpileup .
+SNP calling in this way also works for single sample and has the
+advantage of enabling more powerful filtering. The drawback is the lack
+of short indel calling, which may be implemented in future.
+
+.IP o 2
+Derive the allele frequency spectrum (AFS) on a list of sites from multiple individuals:
+
+  samtools mpileup -gf ref.fa *.bam > all.bcf
+  bcftools view -bl sites.list all.bcf > sites.bcf
+  bcftools view -cGP cond2 sites.bcf > /dev/null 2> sites.1.afs
+  bcftools view -cGP sites.1.afs sites.bcf > /dev/null 2> sites.2.afs
+  bcftools view -cGP sites.2.afs sites.bcf > /dev/null 2> sites.3.afs
+  ......
+
+where
+.I sites.list
+contains the list of sites with each line consisting of the reference
+sequence name and position. The following
+.B bcftools
+commands estimate AFS by EM.
+
+.IP o 2
+Dump BAQ applied alignment for other SNP callers:
+
+  samtools calmd -br aln.bam > aln.baq.bam
+
+It adds and corrects the
+.B NM
+and
+.B MD
+tags at the same time. The
+.B calmd
+command also comes with the
+.B -C
+option, the same as the on in
+.B pileup
+and
+.BR mpileup .
+Apply if it helps.
+
  .SH LIMITATIONS
  .PP
  .IP o 2
  .SH LIMITATIONS
  .PP
  .IP o 2