X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=samtools.git;a=blobdiff_plain;f=bcftools%2Fbcf.tex;h=6f2171fa56ee39c6d1baa23da8e18794764ddcb5;hp=5ca1e282020d640733b7a88be53db8d3a26a6220;hb=c34624801b980425af68c3c431423c72b18c14fe;hpb=f2f3968e11eead9ce5601b01890bc2339ff951e9 diff --git a/bcftools/bcf.tex b/bcftools/bcf.tex index 5ca1e28..6f2171f 100644 --- a/bcftools/bcf.tex +++ b/bcftools/bcf.tex @@ -14,19 +14,19 @@ \begin{tabular}{|l|l|l|l|l|} \hline \multicolumn{2}{|c|}{\bf Field} & \multicolumn{1}{c|}{\bf Descrption} & \multicolumn{1}{c|}{\bf Type} & \multicolumn{1}{c|}{\bf Value} \\\hline\hline -\multicolumn{2}{|l|}{\tt magic} & Magic string & {\tt char[4]} & {\tt BCF\char92 4} \\\hline -\multicolumn{2}{|l|}{\tt l\_nm} & Length of concatenated sequence names & {\tt int32\_t} & \\\hline -\multicolumn{2}{|l|}{\tt name} & Concatenated names, {\tt NULL} padded & {\tt char[l\_nm]} & \\\hline -\multicolumn{2}{|l|}{\tt l\_smpl} & Length of concatenated sample names & {\tt int32\_t} & \\\hline -\multicolumn{2}{|l|}{\tt sname} & Concatenated sample names & {\tt char[l\_smpl]} & \\\hline -\multicolumn{2}{|l|}{\tt l\_txt} & Length of the meta text (double-hash lines)& {\tt int32\_t} & \\\hline -\multicolumn{2}{|l|}{\tt text} & Meta text, {\tt NULL} terminated & {\tt char[l\_txt]} & \\\hline +\multicolumn{2}{|l|}{\sf magic} & Magic string & {\tt char[4]} & {\tt BCF\char92 4} \\\hline +\multicolumn{2}{|l|}{\sf l\_seqnm} & Length of concatenated sequence names & {\tt int32\_t} & \\\hline +\multicolumn{2}{|l|}{\sf seqnm} & Concatenated names, {\tt NULL} padded & {\tt char[{\sf l\_seqnm}]} & \\\hline +\multicolumn{2}{|l|}{\sf l\_smpl} & Length of concatenated sample names & {\tt int32\_t} & \\\hline +\multicolumn{2}{|l|}{\sf smpl} & Concatenated sample names & {\tt char[{\sf l\_smpl}]} & \\\hline +\multicolumn{2}{|l|}{\sf l\_meta} & Length of the meta text (double-hash lines)& {\tt int32\_t} & \\\hline +\multicolumn{2}{|l|}{\sf meta} & Meta text, {\tt NULL} terminated & {\tt char[{\sf l\_meta}]} & \\\hline \multicolumn{5}{|c|}{\it \color{gray}{List of records until the end of the file}}\\\cline{2-5} -& {\tt seq\_id} & Reference sequence ID & {\tt int32\_t} & \\\cline{2-5} -& {\tt pos} & Position & {\tt int32\_t} & \\\cline{2-5} -& {\tt qual} & Variant quality & {\tt float} & \\\cline{2-5} -& {\tt l\_str} & Length of str & {\tt int32\_t} & \\\cline{2-5} -& {\tt str} & {\tt ID+REF+ALT+FILTER+INFO+FORMAT}, {\tt NULL} padded & {\tt char[slen]} &\\\cline{2-5} +& {\sf seq\_id} & Reference sequence ID & {\tt int32\_t} & \\\cline{2-5} +& {\sf pos} & Position & {\tt int32\_t} & \\\cline{2-5} +& {\sf qual} & Variant quality & {\tt float} & \\\cline{2-5} +& {\sf l\_str} & Length of {\sf str} & {\tt int32\_t} & \\\cline{2-5} +& {\sf str} & {\tt ID+REF+ALT+FILTER+INFO+FORMAT}, {\tt NULL} padded & {\tt char[{\sf l\_str}]} &\\\cline{2-5} & \multicolumn{4}{c|}{Blocks of data; \#blocks and formats defined by {\tt FORMAT} (table below)}\\ \hline \end{tabular} @@ -37,27 +37,40 @@ \hline \multicolumn{1}{l}{\bf Field} & \multicolumn{1}{l}{\bf Type} & \multicolumn{1}{l}{\bf Description} \\\hline {\tt DP} & {\tt uint16\_t[n]} & Read depth \\ -{\tt GL} & {\tt float[n*x]} & Log10 likelihood of data; $x=\frac{m(m+1)}{2}$, $m=\#\{alleles\}$\\ -{\tt GT} & {\tt uint8\_t[n]} & {\tt phase\char60\char60 6 | allele1\char60\char60 3 | allele2} \\ +{\tt GL} & {\tt float[n*G]} & Log10 likelihood of data; $G=\frac{A(A+1)}{2}$, $A=\#\{alleles\}$\\ +{\tt GT} & {\tt uint8\_t[n]} & {\tt missing\char60\char60 7 | phased\char60\char60 6 | allele1\char60\char60 3 | allele2} \\ +{\tt \_GT} & {\tt uint8\_t+uint8\_t[n*P]} & {Generic GT; the first int equals the max ploidy $P$} \\ {\tt GQ} & {\tt uint8\_t[n]} & {Genotype quality}\\ {\tt HQ} & {\tt uint8\_t[n*2]} & {Haplotype quality}\\ -{\tt PL} & {\tt uint8\_t[n*x]} & {Phred-scaled likelihood of data}\\ -\emph{misc} & {\tt int32\_t+char*} & {\tt NULL} padded concatenated strings (integer equal to the length) \\ +{\tt \_HQ} & {\tt uint8\_t+uint8\_t[n*P]} & {Generic HQ}\\ +{\tt IBD} & {\tt uint32\_t[n*2]} & {IBD}\\ +{\tt \_IBD} & {\tt uint8\_t+uint32\_t[n*P]} & {Generic IBD}\\ +{\tt PL} & {\tt uint8\_t[n*G]} & {Phred-scaled likelihood of data}\\ +{\tt PS} & {\tt uint32\_t[n]} & {Phase set}\\ +%{\tt SP} & {\tt uint8\_t[n]} & {Strand bias P-value (bcftools only)}\\ +\emph{Integer} & {\tt int32\_t[n*X]} & {Fix-sized custom Integer; $X$ defined in the header}\\ +\emph{Numeric} & {\tt double[n*X]} & {Fix-sized custom Numeric}\\ +\emph{String} & {\tt uint32\_t+char*} & {\tt NULL} padded concat. strings (int equals to the length) \\ \hline \end{tabular} \end{center} \begin{itemize} -\item The file is {\tt BGZF} compressed. -\item All integers are little-endian. +\item A BCF file is in the {\tt BGZF} format. +\item All multi-byte numbers are little-endian. \item In a string, a missing value `.' is an empty C string ``{\tt \char92 0}'' (not ``{\tt .\char92 0}'') \item For {\tt GL} and {\tt PL}, likelihoods of genotypes appear in the order of alleles in {\tt REF} and then {\tt ALT}. For example, if {\tt REF=C}, {\tt ALT=T,A}, likelihoods appear in the order of {\tt - CC,CT,CA,TT,TA,AA}. -\item {\tt GL} is an extension to and is backward compatible with the - {\tt GL} genotype field in {\tt VCFv4.0}. + CC,CT,TT,CA,TA,AA} (NB: the ordering is different from the one in the original + BCF proposal). +\item Predefined {\tt FORMAT} fields can be missing from VCF headers, but custom {\tt FORMAT} fields + are required to be explicitly defined in the headers. +\item A {\tt FORMAT} field with its name starting with `{\tt \_}' gives an alternative + binary representation of the corresponding VCF field. The alternative representation + is used when the default representation is unable to keep the genotype information, + for example, when the ploidy is over 2 or there are more than 8 alleles. \end{itemize} -\end{document} \ No newline at end of file +\end{document}