1 '''Tools for working with files in the samtools pileup -c format.'''
5 PileupSubstitution = collections.namedtuple( "PileupSubstitution",
16 "base_qualities" ) ) )
18 PileupIndel = collections.namedtuple( "PileupIndel",
34 def iterate( infile ):
35 '''iterate over ``samtools pileup -c`` formatted file.
37 *infile* can be any iterator over a lines.
39 The function yields named tuples of the type :class:`pysam.Pileup.PileupSubstitution`
40 or :class:`pysam.Pileup.PileupIndel`.
43 The parser converts to 0-based coordinates
46 conv_subst = (str,lambda x: int(x)-1,str,str,int,int,int,int,str,str)
47 conv_indel = (str,lambda x: int(x)-1,str,str,int,int,int,int,str,str,int,int,int)
53 yield PileupIndel( *[x(y) for x,y in zip(conv_indel,d) ] )
55 raise pysam.SamtoolsError( "parsing error in line: `%s`" % line)
58 yield PileupSubstitution( *[x(y) for x,y in zip(conv_subst,d) ] )
60 raise pysam.SamtoolsError( "parsing error in line: `%s`" % line)
63 'A': 'A', 'C': 'C', 'G': 'G', 'T': 'T',
64 'AA': 'A', 'CC': 'C', 'GG': 'G', 'TT': 'T', 'UU': 'U',
86 ##------------------------------------------------------------
87 def encodeGenotype( code ):
88 '''encode genotypes like GG, GA into a one-letter code.
89 The returned code is lower case if code[0] < code[1], otherwise
92 return ENCODE_GENOTYPE[ code.upper() ]
94 def decodeGenotype( code ):
95 '''decode single letter genotypes like m, M into two letters.
96 This is the reverse operation to :meth:`encodeGenotype`.
98 return DECODE_GENOTYPE[ code ]
100 def translateIndelGenotypeFromVCF( vcf_genotypes, ref ):
101 '''translate indel from vcf to pileup format.'''
104 def getPrefix( s1, s2 ):
105 '''get common prefix of strings s1 and s2.'''
106 n = min( len( s1), len( s2 ) )
108 if s1[x] != s2[x]: return s1[:x]
111 def getSuffix( s1, s2 ):
112 '''get common sufix of strings s1 and s2.'''
113 n = min( len( s1), len( s2 ) )
114 if s1[-1] != s2[-1]: return ""
115 for x in range( -2, -n - 1, -1 ):
116 if s1[x] != s2[x]: return s1[x+1:]
119 def getGenotype( variant, ref ):
121 if variant == ref: return "*", 0
123 if len(ref) > len(variant):
125 if ref.startswith(variant):
126 return "-%s" % ref[len(variant):], len(variant) - 1
127 elif ref.endswith( variant ):
128 return "-%s" % ref[:-len(variant)], -1
130 prefix = getPrefix( ref, variant )
131 suffix = getSuffix( ref, variant )
132 shared = len(prefix) + len(suffix) - len(variant)
133 # print "-", prefix, suffix, ref, variant, shared, len(prefix), len(suffix), len(ref)
136 return "-%s" % ref[len(prefix):-(len(suffix)-shared)], len(prefix) - 1
138 elif len(ref) < len(variant):
140 if variant.startswith(ref):
141 return "+%s" % variant[len(ref):], len(ref) - 1
142 elif variant.endswith(ref):
143 return "+%s" % variant[:len(ref)], 0
145 prefix = getPrefix( ref, variant )
146 suffix = getSuffix( ref, variant )
147 shared = len(prefix) + len(suffix) - len(ref)
151 return "+%s" % variant[len(prefix):-(len(suffix)-shared)], len(prefix)
155 # in pileup, the position refers to the base
156 # after the coordinate, hence subtract 1
159 genotypes, offsets = [], []
162 for variant in vcf_genotypes:
164 g, offset = getGenotype( variant, ref )
168 genotypes.append( g )
169 if g != "*": offsets.append( offset )
177 assert len(set(offsets )) == 1, "multiple offsets for indel"
180 genotypes = "/".join( genotypes )
181 return genotypes, offset
183 def vcf2pileup( vcf, sample ):
184 '''convert vcf record to pileup record.'''
186 chromosome = vcf.contig
189 allelles = [reference] + vcf.alt
194 genotypes = data["GT"]
195 if len(genotypes) > 1:
196 raise ValueError( "only single genotype per position, %s" % (str(vcf)))
198 genotypes = genotypes[0]
201 if genotypes[0] == ".": return None
203 genotypes = [ allelles[int(x)] for x in genotypes if x != "/" ]
205 # snp_quality is "genotype quality"
206 snp_quality = consensus_quality = data.get( "GQ", [0])[0]
207 mapping_quality = vcf.info.get( "MQ", [0])[0]
208 coverage = data.get( "DP", 0)
210 if len(reference) > 1 or max([len(x) for x in vcf.alt] ) > 1:
212 genotype, offset = translateIndelGenotypeFromVCF( genotypes, reference )
214 return PileupIndel( chromosome,
230 genotype = encodeGenotype( "".join(genotypes) )
236 return PileupSubstitution( chromosome, pos, reference,
241 coverage, read_bases, base_qualities )
244 def iterate_from_vcf( infile, sample ):
245 '''iterate over a vcf-formatted file.
247 *infile* can be any iterator over a lines.
249 The function yields named tuples of the type :class:`pysam.Pileup.PileupSubstitution`
250 or :class:`pysam.Pileup.PileupIndel`.
252 Positions without a snp will be skipped.
254 This method is wasteful and written to support same
255 legacy code that expects samtools pileup output.
257 Better use the vcf parser directly.
263 vcf.connect( infile )
265 if sample not in vcf.getsamples():
266 raise KeyErorr( "sample %s not vcf file" )
268 for row in vcf.fetch():
269 result = vcf2pileup( row, sample )
270 if result: yield result