2 # Code to read, write and edit VCF files
4 # VCF lines are encoded as a dictionary with these keys (note: all lowercase):
9 # 'alt': list of strings
11 # 'filter': None (missing value), or list of keys (strings); empty list parsed as ["PASS"]
12 # 'info': dictionary of values (see below)
13 # 'format': list of keys (strings)
14 # sample keys: dictionary of values (see below)
16 # The sample keys are accessible through vcf.getsamples()
18 # A dictionary of values contains value keys (defined in ##INFO or ##FORMAT lines) which map
19 # to a list, containign integers, floats, strings, or characters. Missing values are replaced
20 # by a particular value, often -1 or .
22 # Genotypes are not stored as a string, but as a list of 1 or 3 elements (for haploid and diploid samples),
23 # the first (and last) the integer representing an allele, and the second the separation character.
24 # Note that there is just one genotype per sample, but for consistency the single element is stored in a list.
26 # Header lines other than ##INFO, ##FORMAT and ##FILTER are stored as (key, value) pairs and are accessible
29 # The VCF class can be instantiated with a 'regions' variable consisting of tuples (chrom,start,end) encoding
30 # 0-based half-open segments. Only variants with a position inside the segment will be parsed. A regions
31 # parser is available under parse_regions.
33 # When instantiated, a reference can be passed to the VCF class. This may be any class that supports a
34 # fetch(chrom, start, end) method.
38 # NOTE: the position that is returned to Python is 0-based, NOT 1-based as in the VCF file.
43 # only v4.0 writing is complete; alleles are not converted to v3.3 format
46 from collections import namedtuple, defaultdict
47 from operator import itemgetter
48 import sys, re, copy, bisect
54 gtsRegEx = re.compile("[|/\\\\]")
55 alleleRegEx = re.compile('^[ACGTN]+$')
57 # Utility function. Uses 0-based coordinates
58 def get_sequence(chrom, start, end, fa):
59 # obtain sequence from .fa file, without truncation
60 if end<=start: return ""
61 if not fa: return "N"*(end-start)
62 if start<0: return "N"*(-start) + get_sequence(chrom, 0, end, fa).upper()
63 sequence = fa.fetch(chrom, start, end).upper()
64 if len(sequence) < end-start: sequence += "N"*(end-start-len(sequence))
67 # Utility function. Parses a region string
68 def parse_regions( string ):
70 for r in string.split(','):
72 chrom, start, end = elts[0], 0, 3000000000
76 ielts = elts[1].split('-')
77 if len(ielts) != 2: ValueError("Don't understand region string '%s'" % r)
78 try: start, end = int(ielts[0])-1, int(ielts[1])
79 except: raise ValueError("Don't understand region string '%s'" % r)
81 raise ValueError("Don't understand region string '%s'" % r)
82 result.append( (chrom,start,end) )
86 FORMAT = namedtuple('FORMAT','id numbertype number type description missingvalue')
88 ###########################################################################################################
92 ###########################################################################################################
94 cdef class VCFRecord( TabProxies.TupleProxy):
97 initialized from data and vcf meta
104 def __init__(self, vcf):
106 # if len(data) != len(self.vcf._samples):
107 # self.error(str(data),
108 # self.BAD_NUMBER_OF_COLUMNS,
109 # "expected %s for %s samples (%s), got %s" % \
110 # (len(self.vcf._samples),
111 # len(self.vcf._samples),
115 def __cinit__(self, vcf ):
116 # start indexed access at genotypes
121 def error(self,line,error,opt=None):
123 # pass to vcf file for error handling
124 return self.vcf.error( line, error, opt )
126 cdef update( self, char * buffer, size_t nbytes ):
127 '''update internal data.
129 nbytes does not include the terminal '\0'.
131 TabProxies.TupleProxy.update( self, buffer, nbytes )
133 self.contig = self.fields[0]
134 # vcf counts from 1 - correct here
135 self.pos = atoi( self.fields[1] ) - 1
138 return max(0, self.nfields - 9)
141 def __get__(self): return self.contig
144 def __get__(self): return self.pos
147 def __get__(self): return self.fields[2]
150 def __get__(self): return self.fields[3]
154 # convert v3.3 to v4.0 alleles below
156 if alt == ".": alt = []
157 else: alt = alt.upper().split(',')
162 qual = self.fields[5]
163 if qual == b".": qual = -1
165 try: qual = float(qual)
166 except: self.vcf.error(str(self),self.QUAL_NOT_NUMERICAL)
171 # postpone checking that filters exist. Encode missing filter or no filtering as empty list
172 if f == b"." or f == b"PASS" or f == b"0": return []
173 else: return f.split(';')
178 # dictionary of keys, and list of values
181 for blurp in col.split(';'):
182 elts = blurp.split('=')
183 if len(elts) == 1: v = None
184 elif len(elts) == 2: v = elts[1]
185 else: self.vcf.error(str(self),self.ERROR_INFO_STRING)
186 info[elts[0]] = self.vcf.parse_formatdata(elts[0], v, self.vcf._info, str(self))
191 return self.fields[8].split(':')
195 return self.vcf._samples
197 def __getitem__(self, key):
199 # parse sample columns
200 values = self.fields[self.vcf._sample2column[key]].split(':')
204 if len(values) > len(format):
205 self.error(str(self.line),self.BAD_NUMBER_OF_VALUES,"(found %s values in element %s; expected %s)" %\
206 (len(values),key,len(format)))
209 for idx in range(len(format)):
210 expected = self.vcf.get_expected(format[idx], self.vcf._format, alt)
211 if idx < len(values): value = values[idx]
213 if expected == -1: value = "."
214 else: value = ",".join(["."]*expected)
216 result[format[idx]] = self.vcf.parse_formatdata(format[idx], value, self.vcf._format, str(self.data))
217 if expected != -1 and len(result[format[idx]]) != expected:
218 self.error(str(self.data),self.BAD_NUMBER_OF_PARAMETERS,
219 "id=%s, expected %s parameters, got %s" % (format[idx],expected,result[format[idx]]))
220 if len(result[format[idx]] ) < expected: result[format[idx]] += [result[format[idx]][-1]]*(expected-len(result[format[idx]]))
221 result[format[idx]] = result[format[idx]][:expected]
226 cdef class asVCFRecord( ctabix.Parser ):
227 '''converts a :term:`tabix row` into a VCF record.'''
229 def __init__(self, vcffile ):
230 self.vcffile = vcffile
231 def __call__(self, char * buffer, int len ):
233 r = VCFRecord( self.vcffile )
234 r.copy( buffer, len )
245 NT_PHASED_GENOTYPES = 5
247 _errors = { 0:"UNKNOWN_FORMAT_STRING:Unknown file format identifier",
248 1:"BADLY_FORMATTED_FORMAT_STRING:Formatting error in the format string",
249 2:"BADLY_FORMATTED_HEADING:Did not find 9 required headings (CHROM, POS, ..., FORMAT) %s",
250 3:"BAD_NUMBER_OF_COLUMNS:Wrong number of columns found (%s)",
251 4:"POS_NOT_NUMERICAL:Position column is not numerical",
252 5:"UNKNOWN_CHAR_IN_REF:Unknown character in reference field",
253 6:"V33_BAD_REF:Reference should be single-character in v3.3 VCF",
254 7:"V33_BAD_ALLELE:Cannot interpret allele for v3.3 VCF",
255 8:"POS_NOT_POSITIVE:Position field must be >0",
256 9:"QUAL_NOT_NUMERICAL:Quality field must be numerical, or '.'",
257 10:"ERROR_INFO_STRING:Error while parsing info field",
258 11:"ERROR_UNKNOWN_KEY:Unknown key (%s) found in formatted field (info; format; or filter)",
259 12:"ERROR_FORMAT_NOT_NUMERICAL:Expected integer or float in formatted field; got %s",
260 13:"ERROR_FORMAT_NOT_CHAR:Eexpected character in formatted field; got string",
261 14:"FILTER_NOT_DEFINED:Identifier (%s) in filter found which was not defined in header",
262 15:"FORMAT_NOT_DEFINED:Identifier (%s) in format found which was not defined in header",
263 16:"BAD_NUMBER_OF_VALUES:Found too many of values in sample column (%s)",
264 17:"BAD_NUMBER_OF_PARAMETERS:Found unexpected number of parameters (%s)",
265 18:"BAD_GENOTYPE:Cannot parse genotype (%s)",
266 19:"V40_BAD_ALLELE:Bad allele found for v4.0 VCF (%s)",
267 20:"MISSING_REF:Reference allele missing",
268 21:"V33_UNMATCHED_DELETION:Deleted sequence does not match reference (%s)",
269 22:"V40_MISSING_ANGLE_BRACKETS:Format definition is not deliminted by angular brackets",
270 23:"FORMAT_MISSING_QUOTES:Description field in format definition is not surrounded by quotes",
271 24:"V40_FORMAT_MUST_HAVE_NAMED_FIELDS:Fields in v4.0 VCF format definition must have named fields",
272 25:"HEADING_NOT_SEPARATED_BY_TABS:Heading line appears separated by spaces, not tabs",
273 26:"WRONG_REF:Wrong reference %s",
274 27:"ERROR_TRAILING_DATA:Numerical field ('%s') has semicolon-separated trailing data",
275 28:"BAD_CHR_TAG:Error calculating chr tag for %s",
276 29:"ZERO_LENGTH_ALLELE:Found zero-length allele",
277 30:"MISSING_INDEL_ALLELE_REF_BASE:Indel alleles must begin with single reference base",
278 31:"ZERO_FOR_NON_FLAG_FIELD: number set to 0, but type is not 'FLAG'",
279 32:"ERROR_FORMAT_NOT_INTEGER:Expected integer in formatted field; got %s",
280 33:"ERROR_FLAG_HAS_VALUE:Flag fields should not have a value",
283 # tag-value pairs; tags are not unique; does not include fileformat, INFO, FILTER or FORMAT fields
286 # version number; 33=v3.3; 40=v4.0
289 # info, filter and format data
294 # header; and required columns
295 _required = ["CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT"]
299 _ignored_errors = set([11,31]) # ERROR_UNKNOWN_KEY, ERROR_ZERO_FOR_NON_FLAG_FIELD
300 _warn_errors = set([])
306 # regions to include; None includes everything
314 def __init__(self, _copy=None, reference=None, regions=None, lines=None, leftalign=False):
315 # make error identifiers accessible by name
316 for id in self._errors.keys(): self.__dict__[self._errors[id].split(':')[0]] = id
318 self._leftalign = _copy._leftalign
319 self._header = _copy._header[:]
320 self._version = _copy._version
321 self._info = copy.deepcopy(_copy._info)
322 self._filter = copy.deepcopy(_copy._filter)
323 self._format = copy.deepcopy(_copy._format)
324 self._samples = _copy._samples[:]
325 self._sample2column = copy.deepcopy(_copy._sample2column)
326 self._ignored_errors = copy.deepcopy(_copy._ignored_errors)
327 self._warn_errors = copy.deepcopy(_copy._warn_errors)
328 self._reference = _copy._reference
329 self._regions = _copy._regions
330 if reference: self._reference = reference
331 if regions: self._regions = regions
332 if leftalign: self._leftalign = leftalign
335 def error(self,line,error,opt=None):
336 if error in self._ignored_errors: return
337 errorlabel, errorstring = self._errors[error].split(':')
338 if opt: errorstring = errorstring % opt
339 errwarn = ["Error","Warning"][error in self._warn_errors]
340 errorstring += " in line %s: '%s'\n%s %s: %s\n" % (self._lineno,line,errwarn,errorlabel,errorstring)
341 if error in self._warn_errors: return
342 raise ValueError(errorstring)
344 def parse_format(self,line,format,filter=False):
345 if self._version == 40:
346 if not format.startswith('<'):
347 self.error(line,self.V40_MISSING_ANGLE_BRACKETS)
349 if not format.endswith('>'):
350 self.error(line,self.V40_MISSING_ANGLE_BRACKETS)
352 format = format[1:-1]
353 data = {'id':None,'number':None,'type':None,'descr':None}
355 while len(format.strip())>0:
356 elts = format.strip().split(',')
357 first, rest = elts[0], ','.join(elts[1:])
358 if first.find('=') == -1 or (first.find('"')>=0 and first.find('=') > first.find('"')):
359 if self._version == 40: self.error(line,self.V40_FORMAT_MUST_HAVE_NAMED_FIELDS)
360 if idx == 4: self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
361 first = ["ID=","Number=","Type=","Description="][idx] + first
362 if first.startswith('ID='): data['id'] = first.split('=')[1]
363 elif first.startswith('Number='): data['number'] = first.split('=')[1]
364 elif first.startswith('Type='): data['type'] = first.split('=')[1]
365 elif first.startswith('Description='):
366 elts = format.split('"')
368 self.error(line,self.FORMAT_MISSING_QUOTES)
369 elts = first.split('=') + [rest]
370 data['descr'] = elts[1]
371 rest = '"'.join(elts[2:])
372 if rest.startswith(','): rest = rest[1:]
374 self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
377 if filter and idx==1: idx=3 # skip number and type fields for FILTER format strings
378 if not data['id']: self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
379 if 'descr' not in data:
380 # missing description
381 self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
383 if not data['type'] and not data['number']:
384 # fine, ##filter format
385 return FORMAT(data['id'],self.NT_NUMBER,0,"Flag",data['descr'],'.')
386 if not data['type'] in ["Integer","Float","Character","String","Flag"]:
387 self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
388 # I would like a missing-value field, but it isn't there
389 if data['type'] in ['Integer','Float']: data['missing'] = None # Do NOT use arbitrary int/float as missing value
390 else: data['missing'] = '.'
391 if not data['number']: self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
393 n = int(data['number'])
397 if data['number'] == '.': t = self.NT_UNKNOWN
398 elif data['number'] == '#alleles': t = self.NT_ALLELES
399 elif data['number'] == '#nonref_alleles': t = self.NT_NR_ALLELES
400 elif data['number'] == '#genotypes': t = self.NT_GENOTYPES
401 elif data['number'] == '#phased_genotypes': t = self.NT_PHASED_GENOTYPES
402 elif data['number'] == '#phased_genotypes': t = self.NT_PHASED_GENOTYPES
403 # abbreviations added in VCF version v4.1
404 elif data['number'] == 'A': t = self.NT_ALLELES
405 elif data['number'] == 'G': t = self.NT_GENOTYPES
407 self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
408 # if number is 0 - type must be Flag
409 if n == 0 and data['type'] != 'Flag':
410 self.error( line, self.ZERO_FOR_NON_FLAG_FIELD)
411 # force type 'Flag' if no number
412 data['type'] = 'Flag'
414 return FORMAT(data['id'],t,n,data['type'],data['descr'],data['missing'])
416 def format_format( self, fmt, filter=False ):
417 values = [('ID',fmt.id)]
418 if fmt.number != None and not filter:
419 if fmt.numbertype == self.NT_UNKNOWN: nmb = "."
420 elif fmt.numbertype == self.NT_NUMBER: nmb = str(fmt.number)
421 elif fmt.numbertype == self.NT_ALLELES: nmb = "#alleles"
422 elif fmt.numbertype == self.NT_NR_ALLELES: nmb = "#nonref_alleles"
423 elif fmt.numbertype == self.NT_GENOTYPES: nmb = "#genotypes"
424 elif fmt.numbertype == self.NT_PHASED_GENOTYPES: nmb = "#phased_genotypes"
426 raise ValueError("Unknown number type encountered: %s" % fmt.numbertype)
427 values.append( ('Number',nmb) )
428 values.append( ('Type', fmt.type) )
429 values.append( ('Description', '"' + fmt.description + '"') )
430 if self._version == 33:
431 format = ",".join([v for k,v in values])
433 format = "<" + (",".join( ["%s=%s" % (k,v) for (k,v) in values] )) + ">"
436 def get_expected(self, format, formatdict, alt):
437 fmt = formatdict[format]
438 if fmt.numbertype == self.NT_UNKNOWN: return -1
439 if fmt.numbertype == self.NT_NUMBER: return fmt.number
440 if fmt.numbertype == self.NT_ALLELES: return len(alt)+1
441 if fmt.numbertype == self.NT_NR_ALLELES: return len(alt)
442 if fmt.numbertype == self.NT_GENOTYPES: return ((len(alt)+1)*(len(alt)+2)) // 2
443 if fmt.numbertype == self.NT_PHASED_GENOTYPES: return (len(alt)+1)*(len(alt)+1)
447 def _add_definition(self, formatdict, key, data, line ):
448 if key in formatdict: return
449 self.error(line,self.ERROR_UNKNOWN_KEY,key)
451 formatdict[key] = FORMAT(key,self.NT_NUMBER,0,"Flag","(Undefined tag)",".")
453 if data == []: data = [""] # unsure what type -- say string
454 if type(data[0]) == type(0.0):
455 formatdict[key] = FORMAT(key,self.NT_UNKNOWN,-1,"Float","(Undefined tag)",None)
457 if type(data[0]) == type(0):
458 formatdict[key] = FORMAT(key,self.NT_UNKNOWN,-1,"Integer","(Undefined tag)",None)
460 formatdict[key] = FORMAT(key,self.NT_UNKNOWN,-1,"String","(Undefined tag)",".")
463 # todo: trim trailing missing values
464 def format_formatdata( self, data, format, key=True, value=True, separator=":" ):
465 output, sdata = [], []
466 if type(data) == type([]): # for FORMAT field, make data with dummy values
468 for k in data: d[k] = []
470 # convert missing values; and silently add definitions if required
472 self._add_definition( format, k, data[k], "(output)" )
473 for idx,v in enumerate(data[k]):
474 if v == format[k].missingvalue: data[k][idx] = "."
475 # make sure GT comes first; and ensure fixed ordering; also convert GT data back to string
477 if k != 'GT': sdata.append( (k,data[k]) )
480 sdata = [('GT',map(self.convertGTback,data['GT']))] + sdata
484 if v != None: output.append( k+"="+','.join(map(str,v)) )
485 else: output.append( k )
486 elif key: output.append(k)
488 if v != None: output.append( ','.join(map(str,v)) )
489 else: output.append( "." ) # should not happen
490 # snip off trailing missing data
491 while len(output) > 1:
492 last = output[-1].replace(',','').replace('.','')
493 if len(last)>0: break
495 return separator.join(output)
498 def enter_default_format(self):
499 for f in [FORMAT('GT',self.NT_NUMBER,1,'String','Genotype','.'),
500 FORMAT('DP',self.NT_NUMBER,1,'Integer','Read depth at this position for this sample',-1),
501 FORMAT('FT',self.NT_NUMBER,1,'String','Sample Genotype Filter','.'),
502 FORMAT('GL',self.NT_UNKNOWN,-1,'Float','Genotype likelihoods','.'),
503 FORMAT('GLE',self.NT_UNKNOWN,-1,'Float','Genotype likelihoods','.'),
504 FORMAT('GQ',self.NT_NUMBER,1,'Integer','Genotype Quality',-1),
505 FORMAT('PL',self.NT_GENOTYPES,-1,'Integer','Phred-scaled genotype likelihoods', '.'),
506 FORMAT('GP',self.NT_GENOTYPES,-1,'Float','Genotype posterior probabilities','.'),
507 FORMAT('GQ',self.NT_GENOTYPES,-1,'Integer','Conditional genotype quality','.'),
508 FORMAT('HQ',self.NT_UNKNOWN,-1,'Integer','Haplotype Quality',-1), # unknown number, since may be haploid
509 FORMAT('PS',self.NT_UNKNOWN,-1,'Integer','Phase set','.'),
510 FORMAT('PQ',self.NT_NUMBER,1,'Integer','Phasing quality',-1),
511 FORMAT('EC',self.NT_ALLELES,1,'Integer','Expected alternate allel counts',-1),
512 FORMAT('MQ',self.NT_NUMBER,1,'Integer','RMS mapping quality',-1),
514 if f.id not in self._format:
515 self._format[f.id] = f
517 def parse_header( self, line ):
518 assert line.startswith('##')
519 elts = line[2:].split('=')
520 key = elts[0].strip()
521 value = '='.join(elts[1:]).strip()
522 if key == "fileformat":
523 if value == "VCFv3.3":
525 elif value == "VCFv4.0":
527 elif value == "VCFv4.1":
531 self.error(line,self.UNKNOWN_FORMAT_STRING)
533 f = self.parse_format(line, value)
534 self._info[ f.id ] = f
535 elif key == "FILTER":
536 f = self.parse_format(line, value, filter=True)
537 self._filter[ f.id ] = f
538 elif key == "FORMAT":
539 f = self.parse_format(line, value)
540 self._format[ f.id ] = f
542 # keep other keys in the header field
543 self._header.append( (key,value) )
546 def write_header( self, stream ):
547 stream.write("##fileformat=VCFv%s.%s\n" % (self._version // 10, self._version % 10))
548 for key,value in self._header: stream.write("##%s=%s\n" % (key,value))
549 for var,label in [(self._info,"INFO"),(self._filter,"FILTER"),(self._format,"FORMAT")]:
550 for f in var.itervalues(): stream.write("##%s=%s\n" % (label,self.format_format(f,filter=(label=="FILTER"))))
553 def parse_heading( self, line ):
554 assert line.startswith('#')
555 assert not line.startswith('##')
556 headings = line[1:].split('\t')
557 # test for 8, as FORMAT field might be missing
558 if len(headings)==1 and len(line[1:].split()) >= 8:
559 self.error(line,self.HEADING_NOT_SEPARATED_BY_TABS)
560 headings = line[1:].split()
562 for i,s in enumerate(self._required):
564 if len(headings)<=i or headings[i] != s:
566 if len(headings) <= i:
567 err = "(%sth entry not found)" % (i+1)
569 err = "(found %s, expected %s)" % (headings[i],s)
571 #self.error(line,self.BADLY_FORMATTED_HEADING,err)
572 # allow FORMAT column to be absent
573 if len(headings) == 8:
574 headings.append("FORMAT")
576 self.error(line,self.BADLY_FORMATTED_HEADING,err)
578 self._samples = headings[9:]
579 self._sample2column = dict( [(y,x+9) for x,y in enumerate( self._samples ) ] )
581 def write_heading( self, stream ):
582 stream.write("#" + "\t".join(self._required + self._samples) + "\n")
584 def convertGT(self, GTstring):
585 if GTstring == ".": return ["."]
587 gts = gtsRegEx.split(GTstring)
588 if len(gts) == 1: return [int(gts[0])]
589 if len(gts) != 2: raise ValueError()
590 if gts[0] == "." and gts[1] == ".": return [gts[0],GTstring[len(gts[0]):-len(gts[1])],gts[1]]
591 return [int(gts[0]),GTstring[len(gts[0]):-len(gts[1])],int(gts[1])]
593 self.error(self._line,self.BAD_GENOTYPE,GTstring)
596 def convertGTback(self, GTdata):
597 return ''.join(map(str,GTdata))
599 def parse_formatdata( self, key, value, formatdict, line ):
600 # To do: check that the right number of values is present
601 f = formatdict.get(key,None)
603 self._add_definition(formatdict, key, value, line )
606 if value is not None: self.error(line,self.ERROR_FLAG_HAS_VALUE)
608 values = value.split(',')
609 # deal with trailing data in some early VCF files
610 if f.type in ["Float","Integer"] and len(values)>0 and values[-1].find(';') > -1:
611 self.error(line,self.ERROR_TRAILING_DATA,values[-1])
612 values[-1] = values[-1].split(';')[0]
613 if f.type == "Integer":
614 for idx,v in enumerate(values):
616 if v == ".": values[idx] = f.missingvalue
617 else: values[idx] = int(v)
619 self.error(line,self.ERROR_FORMAT_NOT_INTEGER,"%s=%s" % (key, str(values)))
620 return [0] * len(values)
622 elif f.type == "String":
624 if f.id == "GT": values = list(map( self.convertGT, values ))
626 elif f.type == "Character":
628 if len(v) != 1: self.error(line,self.ERROR_FORMAT_NOT_CHAR)
630 elif f.type == "Float":
631 for idx,v in enumerate(values):
632 if v == ".": values[idx] = f.missingvalue
633 try: return list(map(float,values))
635 self.error(line,self.ERROR_FORMAT_NOT_NUMERICAL,"%s=%s" % (key, str(values)))
636 return [0.0] * len(values)
639 self.error(line,self.ERROR_INFO_STRING)
641 def inregion(self, chrom, pos):
642 if not self._regions: return True
643 for r in self._regions:
644 if r[0] == chrom and r[1] <= pos < r[2]: return True
647 def parse_data( self, line, lineparse=False ):
648 cols = line.split('\t')
649 if len(cols) != len(self._samples)+9:
650 # gracefully deal with absent FORMAT column
651 # and those missing samples
656 self.BAD_NUMBER_OF_COLUMNS,
657 "expected %s for %s samples (%s), got %s" % (len(self._samples)+9, len(self._samples), self._samples, len(cols)))
661 # get 0-based position
662 try: pos = int(cols[1])-1
663 except: self.error(line,self.POS_NOT_NUMERICAL)
664 if pos < 0: self.error(line,self.POS_NOT_POSITIVE)
666 # implement filtering
667 if not self.inregion(chrom,pos): return None
669 # end of first-pass parse for sortedVCF
670 if lineparse: return chrom, pos, line
674 ref = cols[3].upper()
676 self.error(line,self.MISSING_REF)
677 if self._version == 33: ref = get_sequence(chrom,pos,pos+1,self._reference)
681 if c not in "ACGTN": self.error(line,self.UNKNOWN_CHAR_IN_REF)
682 if "N" in ref: ref = get_sequence(chrom,pos,pos+len(ref),self._reference)
684 # make sure reference is sane
686 left = max(0,pos-100)
687 faref_leftflank = get_sequence(chrom,left,pos+len(ref),self._reference)
688 faref = faref_leftflank[pos-left:]
689 if faref != ref: self.error(line,self.WRONG_REF,"(reference is %s, VCF says %s)" % (faref,ref))
692 # convert v3.3 to v4.0 alleles below
693 if cols[4] == ".": alt = []
694 else: alt = cols[4].upper().split(',')
696 if cols[5] == ".": qual = -1
698 try: qual = float(cols[5])
699 except: self.error(line,self.QUAL_NOT_NUMERICAL)
701 # postpone checking that filters exist. Encode missing filter or no filtering as empty list
702 if cols[6] == "." or cols[6] == "PASS" or cols[6] == "0": filter = []
703 else: filter = cols[6].split(';')
705 # dictionary of keys, and list of values
708 for blurp in cols[7].split(';'):
709 elts = blurp.split('=')
710 if len(elts) == 1: v = None
711 elif len(elts) == 2: v = elts[1]
712 else: self.error(line,self.ERROR_INFO_STRING)
713 info[elts[0]] = self.parse_formatdata(elts[0],
718 # Gracefully deal with absent FORMAT column
719 if cols[8] == "": format = []
720 else: format = cols[8].split(':')
722 # check: all filters are defined
724 if f not in self._filter: self.error(line,self.FILTER_NOT_DEFINED, f)
726 # check: format fields are defined
729 if f not in self._format: self.error(line,self.FORMAT_NOT_DEFINED, f)
731 # convert v3.3 alleles
732 if self._version == 33:
733 if len(ref) != 1: self.error(line,self.V33_BAD_REF)
735 have_deletions = False
737 if len(a) == 1: a = a + ref[1:] # SNP; add trailing reference
738 elif a.startswith('I'): a = ref[0] + a[1:] + ref[1:] # insertion just beyond pos; add first and trailing reference
739 elif a.startswith('D'): # allow D<seq> and D<num>
740 have_deletions = True
742 l = int(a[1:]) # throws ValueError if sequence
743 if len(ref) < l: # add to reference if necessary
744 addns = get_sequence(chrom,pos+len(ref),pos+l,self._reference)
746 for i,na in enumerate(newalts): newalts[i] = na+addns
747 a = ref[l:] # new deletion, deleting pos...pos+l
750 if len(ref) < len(s): # add Ns to reference if necessary
751 addns = get_sequence(chrom,pos+len(ref),pos+len(s),self._reference)
752 if not s.endswith(addns) and addns != 'N'*len(addns):
753 self.error(line,self.V33_UNMATCHED_DELETION,
754 "(deletion is %s, reference is %s)" % (a,get_sequence(chrom,pos,pos+len(s),self._reference)))
756 for i,na in enumerate(newalts): newalts[i] = na+addns
757 a = ref[len(s):] # new deletion, deleting from pos
759 self.error(line,self.V33_BAD_ALLELE)
762 # deletion alleles exist, add dummy 1st reference allele, and account for leading base
765 # Petr Danacek's: we can't have a leading nucleotide at (1-based) position 1
766 addn = get_sequence(chrom,pos+len(ref),pos+len(ref)+1,self._reference)
768 alt = [allele+addn for allele in alt]
770 addn = get_sequence(chrom,pos-1,pos,self._reference)
772 alt = [addn + allele for allele in alt]
775 # format v4.0 -- just check for nucleotides
777 if not alleleRegEx.match(allele):
778 self.error(line,self.V40_BAD_ALLELE,allele)
780 # check for leading nucleotide in indel calls
782 if len(allele) != len(ref):
783 if len(allele) == 0: self.error(line,self.ZERO_LENGTH_ALLELE)
784 if ref[0].upper() != allele[0].upper() and "N" not in (ref[0]+allele[0]).upper():
785 self.error(line,self.MISSING_INDEL_ALLELE_REF_BASE)
787 # trim trailing bases in alleles
789 for i in range(1,min(len(ref),min(map(len,alt)))):
790 if len(set(allele[-1].upper() for allele in alt)) > 1 or ref[-1].upper() != alt[0][-1].upper():
792 ref, alt = ref[:-1], [allele[:-1] for allele in alt]
794 # left-align alleles, if a reference is available
795 if self._leftalign and self._reference:
799 if len(allele) > len(ref):
800 longest, shortest = allele, ref
802 longest, shortest = ref, allele
803 if len(longest) == len(shortest) or longest[:len(shortest)].upper() != shortest.upper():
805 if longest[-1].upper() != longest[len(shortest)-1].upper():
810 alt = [allele[:-1] for allele in alt]
811 if min([len(allele) for allele in alt]) == 0 or len(ref) == 0:
812 ref = faref_leftflank[pos-left-1] + ref
813 alt = [faref_leftflank[pos-left-1] + allele for allele in alt]
816 # parse sample columns
818 for sample in cols[9:]:
820 values = sample.split(':')
821 if len(values) > len(format):
822 self.error(line,self.BAD_NUMBER_OF_VALUES,"(found %s values in element %s; expected %s)" % (len(values),sample,len(format)))
823 for idx in range(len(format)):
824 expected = self.get_expected(format[idx], self._format, alt)
825 if idx < len(values): value = values[idx]
827 if expected == -1: value = "."
828 else: value = ",".join(["."]*expected)
830 dict[format[idx]] = self.parse_formatdata(format[idx],
834 if expected != -1 and len(dict[format[idx]]) != expected:
835 self.error(line,self.BAD_NUMBER_OF_PARAMETERS,
836 "id=%s, expected %s parameters, got %s" % (format[idx],expected,dict[format[idx]]))
837 if len(dict[format[idx]] ) < expected: dict[format[idx]] += [dict[format[idx]][-1]]*(expected-len(dict[format[idx]]))
838 dict[format[idx]] = dict[format[idx]][:expected]
839 samples.append( dict )
843 'pos':pos, # return 0-based position
851 for key,value in zip(self._samples,samples):
857 def write_data(self, stream, data):
858 required = ['chrom','pos','id','ref','alt','qual','filter','info','format'] + self._samples
860 if k not in data: raise ValueError("Required key %s not found in data" % str(k))
861 if data['alt'] == []: alt = "."
862 else: alt = ",".join(data['alt'])
863 if data['filter'] == None: filter = "."
864 elif data['filter'] == []:
865 if self._version == 33: filter = "0"
866 else: filter = "PASS"
867 else: filter = ';'.join(data['filter'])
868 if data['qual'] == -1: qual = "."
869 else: qual = str(data['qual'])
871 output = [data['chrom'],
872 str(data['pos']+1), # change to 1-based position
878 self.format_formatdata( data['info'], self._info, separator=";" ),
879 self.format_formatdata( data['format'], self._format, value=False ) ]
881 for s in self._samples:
882 output.append( self.format_formatdata( data[s], self._format, key=False ) )
884 stream.write( "\t".join(output) + "\n" )
886 def _parse_header(self, stream):
890 if line.startswith('##'):
891 self.parse_header( line.strip() )
892 elif line.startswith('#'):
893 self.parse_heading( line.strip() )
894 self.enter_default_format()
899 def _parse(self, line, stream):
900 # deal with files with header only
901 if line.startswith("##"): return
902 if len(line.strip()) > 0:
903 d = self.parse_data( line.strip() )
907 if self._lines and self._lineno > self._lines: raise StopIteration
908 d = self.parse_data( line.strip() )
911 ######################################################################################################
915 ######################################################################################################
917 def getsamples(self):
918 """ List of samples in VCF file """
921 def setsamples(self,samples):
922 """ List of samples in VCF file """
923 self._samples = samples
926 """ List of header key-value pairs (strings) """
929 def setheader(self,header):
930 """ List of header key-value pairs (strings) """
931 self._header = header
934 """ Dictionary of ##INFO tags, as VCF.FORMAT values """
937 def setinfo(self,info):
938 """ Dictionary of ##INFO tags, as VCF.FORMAT values """
942 """ Dictionary of ##FORMAT tags, as VCF.FORMAT values """
945 def setformat(self,format):
946 """ Dictionary of ##FORMAT tags, as VCF.FORMAT values """
947 self._format = format
950 """ Dictionary of ##FILTER tags, as VCF.FORMAT values """
953 def setfilter(self,filter):
954 """ Dictionary of ##FILTER tags, as VCF.FORMAT values """
955 self._filter = filter
957 def setversion(self, version):
958 if version != 33 and version != 40: raise ValueError("Can only handle v3.3 and v4.0 VCF files")
959 self._version = version
961 def setregions(self, regions):
962 self._regions = regions
964 def setreference(self, ref):
965 """ Provide a reference sequence; a Python class supporting a fetch(chromosome, start, end) method, e.g. PySam.FastaFile """
966 self._reference = ref
968 def ignoreerror(self, errorstring):
969 try: self._ignored_errors.add(self.__dict__[errorstring])
970 except KeyError: raise ValueError("Invalid error string: %s" % errorstring)
972 def warnerror(self, errorstring):
973 try: self._warn_errors.add(self.__dict__[errorstring])
974 except KeyError: raise ValueError("Invalid error string: %s" % errorstring)
976 def parse(self, stream):
977 """ Parse a stream of VCF-formatted lines. Initializes class instance and return generator """
978 last_line = self._parse_header(stream)
979 # now return a generator that does the actual work. In this way the pre-processing is done
980 # before the first piece of data is yielded
981 return self._parse(last_line, stream)
983 def write(self, stream, datagenerator):
984 """ Writes a VCF file to a stream, using a data generator (or list) """
985 self.write_header(stream)
986 self.write_heading(stream)
987 for data in datagenerator: self.write_data(stream,data)
989 def writeheader(self, stream):
990 """ Writes a VCF header """
991 self.write_header(stream)
992 self.write_heading(stream)
994 def compare_calls(self, pos1, ref1, alt1, pos2, ref2, alt2):
995 """ Utility function: compares two calls for equality """
996 # a variant should always be assigned to a unique position, one base before
997 # the leftmost position of the alignment gap. If this rule is implemented
998 # correctly, the two positions must be equal for the calls to be identical.
999 if pos1 != pos2: return False
1000 # from both calls, trim rightmost bases when identical. Do this safely, i.e.
1001 # only when the reference bases are not Ns
1002 while len(ref1)>0 and len(alt1)>0 and ref1[-1] == alt1[-1]:
1005 while len(ref2)>0 and len(alt2)>0 and ref2[-1] == alt2[-1]:
1008 # now, the alternative alleles must be identical
1011 ###########################################################################################################
1012 ###########################################################################################################
1013 ## API functions added by Andreas
1014 ###########################################################################################################
1016 def connect( self, filename ):
1017 '''connect to tabix file.'''
1018 self.tabixfile = pysam.Tabixfile( filename )
1019 self._parse_header(self.tabixfile.header)
1026 """ Parse a stream of VCF-formatted lines. Initializes class instance and return generator """
1028 return self.tabixfile.fetch( reference, start, end, region, parser = asVCFRecord( self ) )
1030 def validate( self, record ):
1031 '''validate vcf record.
1033 returns a validated record.
1036 raise NotImplementedError( "needs to be checked" )
1038 chrom, pos = record.chrom, record.pos
1043 self.error(str(record),self.MISSING_REF)
1044 if self._version == 33: ref = get_sequence(chrom,pos,pos+1,self._reference)
1048 if c not in "ACGTN": self.error(str(record),self.UNKNOWN_CHAR_IN_REF)
1049 if "N" in ref: ref = get_sequence(chrom,
1054 # make sure reference is sane
1056 left = max(0,self.pos-100)
1057 faref_leftflank = get_sequence(chrom,left,self.pos+len(ref),self._reference)
1058 faref = faref_leftflank[pos-left:]
1059 if faref != ref: self.error(str(record),self.WRONG_REF,"(reference is %s, VCF says %s)" % (faref,ref))
1062 # check: format fields are defined
1063 for f in record.format:
1064 if f not in self._format: self.error(str(record),self.FORMAT_NOT_DEFINED, f)
1066 # check: all filters are defined
1067 for f in record.filter:
1068 if f not in self._filter: self.error(str(record),self.FILTER_NOT_DEFINED, f)
1070 # convert v3.3 alleles
1071 if self._version == 33:
1072 if len(ref) != 1: self.error(str(record),self.V33_BAD_REF)
1074 have_deletions = False
1076 if len(a) == 1: a = a + ref[1:] # SNP; add trailing reference
1077 elif a.startswith('I'): a = ref[0] + a[1:] + ref[1:] # insertion just beyond pos; add first and trailing reference
1078 elif a.startswith('D'): # allow D<seq> and D<num>
1079 have_deletions = True
1081 l = int(a[1:]) # throws ValueError if sequence
1082 if len(ref) < l: # add to reference if necessary
1083 addns = get_sequence(chrom,pos+len(ref),pos+l,self._reference)
1085 for i,na in enumerate(newalts): newalts[i] = na+addns
1086 a = ref[l:] # new deletion, deleting pos...pos+l
1089 if len(ref) < len(s): # add Ns to reference if necessary
1090 addns = get_sequence(chrom,pos+len(ref),pos+len(s),self._reference)
1091 if not s.endswith(addns) and addns != 'N'*len(addns):
1092 self.error(str(record),self.V33_UNMATCHED_DELETION,
1093 "(deletion is %s, reference is %s)" % (a,get_sequence(chrom,pos,pos+len(s),self._reference)))
1095 for i,na in enumerate(newalts): newalts[i] = na+addns
1096 a = ref[len(s):] # new deletion, deleting from pos
1098 self.error(str(record),self.V33_BAD_ALLELE)
1101 # deletion alleles exist, add dummy 1st reference allele, and account for leading base
1104 # Petr Danacek's: we can't have a leading nucleotide at (1-based) position 1
1105 addn = get_sequence(chrom,pos+len(ref),pos+len(ref)+1,self._reference)
1107 alt = [allele+addn for allele in alt]
1109 addn = get_sequence(chrom,pos-1,pos,self._reference)
1111 alt = [addn + allele for allele in alt]
1114 # format v4.0 -- just check for nucleotides
1116 if not alleleRegEx.match(allele):
1117 self.error(str(record),self.V40_BAD_ALLELE,allele)
1120 # check for leading nucleotide in indel calls
1122 if len(allele) != len(ref):
1123 if len(allele) == 0: self.error(str(record),self.ZERO_LENGTH_ALLELE)
1124 if ref[0].upper() != allele[0].upper() and "N" not in (ref[0]+allele[0]).upper():
1125 self.error(str(record),self.MISSING_INDEL_ALLELE_REF_BASE)
1127 # trim trailing bases in alleles
1128 for i in range(1,min(len(ref),min(map(len,alt)))):
1129 if len(set(allele[-1].upper() for allele in alt)) > 1 or ref[-1].upper() != alt[0][-1].upper():
1131 ref, alt = ref[:-1], [allele[:-1] for allele in alt]
1133 # left-align alleles, if a reference is available
1134 if self._leftalign and self._reference:
1138 if len(allele) > len(ref):
1139 longest, shortest = allele, ref
1141 longest, shortest = ref, allele
1142 if len(longest) == len(shortest) or longest[:len(shortest)].upper() != shortest.upper():
1144 if longest[-1].upper() != longest[len(shortest)-1].upper():
1149 alt = [allele[:-1] for allele in alt]
1150 if min([len(allele) for allele in alt]) == 0 or len(ref) == 0:
1151 ref = faref_leftflank[pos-left-1] + ref
1152 alt = [faref_leftflank[pos-left-1] + allele for allele in alt]
1156 "VCF", "VCFRecord", ]