1 # cython: embedsignature=True
2 # adds doc-strings for sphinx
4 import tempfile, os, sys, types, itertools, struct, ctypes, gzip
5 from cpython cimport PyString_FromStringAndSize, PyString_AS_STRING
9 '''*(filename, mode='r')*
11 opens a :term:`tabix file` for reading. A missing
12 index (*filename* + ".tbi") will raise an exception.
15 def __cinit__(self, *args, **kwargs ):
17 self._open( *args, **kwargs )
20 '''return true if samfile has been opened.'''
21 return self.tabixfile != NULL
27 '''open a :term:`tabix file` for reading.
30 assert mode in ( "r",), "invalid file opening mode `%s`" % mode
32 # close a previously opened file
33 if self.tabixfile != NULL: self.close()
36 if self._filename != NULL: free(self._filename )
37 self._filename = strdup( filename )
39 filename_index = filename + ".tbi"
42 # open file for writing
46 # open file for reading
47 if not os.path.exists( self._filename ):
48 raise IOError( "file `%s` not found" % self._filename)
50 if not os.path.exists( filename_index ):
51 raise IOError( "index `%s` not found" % filename_index)
53 # open file and load index
54 self.tabixfile = ti_open( self._filename, filename_index )
56 if self.tabixfile == NULL:
57 raise IOError("could not open file `%s`" % filename )
59 def _parseRegion( self,
64 '''parse region information.
66 raise ValueError for for invalid regions.
68 returns a tuple of region, tid, start and end. Region
69 is a valid samtools :term:`region` or None if the region
70 extends over the whole file.
72 Note that regions are 1-based, while start,end are python coordinates.
74 ti_lazy_index_load( self.tabixfile )
82 rtid = rstart = rend = 0
84 # translate to a region
86 if start != None and end != None:
87 region = "%s:%i-%i" % (reference, start+1, end)
88 elif start == None and end != None:
89 region = "%s:%i-%i" % (reference, 1, end)
90 elif end == None and start != None:
91 region = "%s:%i-%i" % (reference, start+1, max_pos-1)
96 ti_parse_region( self.tabixfile.idx, region, &rtid, &rstart, &rend)
97 if rtid < 0: raise ValueError( "invalid region `%s`" % region )
98 if rstart > rend: raise ValueError( 'invalid region: start (%i) > end (%i)' % (rstart, rend) )
99 if not 0 <= rstart < max_pos: raise ValueError( 'start out of range (%i)' % rstart )
100 if not 0 <= rend < max_pos: raise ValueError( 'end out of range (%i)' % rend )
102 return region, rtid, rstart, rend
112 fetch one or more rows in a :term:`region` using 0-based indexing. The region is specified by
113 :term:`reference`, *start* and *end*. Alternatively, a samtools :term:`region` string can be supplied.
115 Without *reference* or *region* all entries will be fetched.
117 If only *reference* is set, all reads matching on *reference* will be fetched.
119 If *parser* is None, the results are returned as an unparsed string.
120 Otherwise, *parser* is assumed to be a functor that will return parsed
121 data (see for example :meth:`asTuple` and :meth:`asGTF`).
123 ti_lazy_index_load( self.tabixfile )
125 if not self._isOpen():
126 raise ValueError( "I/O operation on closed file" )
128 region, rtid, rstart, rend = self._parseRegion( reference, start, end, region )
132 return TabixIterator( self, rtid, rstart, rend )
134 return TabixIterator( self, -1, 0, 0 )
137 return TabixIteratorParsed( self, rtid, rstart, rend, parser )
139 return TabixIteratorParsed( self, -1, 0, 0, parser )
141 ###############################################################
142 ###############################################################
143 ###############################################################
145 ###############################################################
147 '''filename associated with this object.'''
149 if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
150 return self._filename
156 The header is returned as an iterator over lines without the
161 return TabixHeaderIterator( self )
164 '''chromosome names'''
166 cdef char ** sequences
169 ti_lazy_index_load( self.tabixfile )
170 sequences = ti_seqname( self.tabixfile.idx, &nsequences )
173 for x from 0 <= x < nsequences:
174 result.append( sequences[x] )
179 closes the :class:`pysam.Tabixfile`.'''
180 if self.tabixfile != NULL:
181 ti_close( self.tabixfile )
182 self.tabixfile = NULL
184 def __dealloc__( self ):
185 # remember: dealloc cannot call other python methods
186 # note: no doc string
187 # note: __del__ is not called.
188 if self.tabixfile != NULL:
189 ti_close( self.tabixfile )
190 self.tabixfile = NULL
191 if self._filename != NULL: free( self._filename )
193 cdef class TabixIterator:
194 """iterates over rows in *tabixfile* in region
195 given by *tid*, *start* and *end*.
198 cdef ti_iter_t iterator
199 cdef tabix_t * tabixfile
201 def __cinit__(self, Tabixfile tabixfile,
202 int tid, int start, int end ):
204 assert tabixfile._isOpen()
206 # makes sure that samfile stays alive as long as the
208 self.tabixfile = tabixfile.tabixfile
211 # seek to start of file to ensure iteration is over
213 bgzf_seek( self.tabixfile.fp, 0, 0)
214 self.iterator = ti_iter_first()
216 self.iterator = ti_queryi(self.tabixfile, tid, start, end)
218 if <void*>self.iterator == NULL:
219 raise ValueError("malformatted query or wrong sequence name.\n")
225 """python version of next().
227 pyrex uses this non-standard name instead of next()
232 # metachar filtering does not work within tabix
233 # though it should. Getting the metachar is a pain
234 # as ti_index_t is incomplete type.
236 # simply use '#' for now.
238 s = ti_read(self.tabixfile, self.iterator, &len)
239 if s == NULL: raise StopIteration
240 if s[0] != '#': break
244 def __dealloc__(self):
245 if <void*>self.iterator != NULL:
246 ti_iter_destroy(self.iterator)
248 cdef class TabixHeaderIterator:
249 """return header lines.
252 cdef ti_iter_t iterator
253 cdef tabix_t * tabixfile
255 def __cinit__(self, Tabixfile tabixfile ):
257 assert tabixfile._isOpen()
259 # makes sure that samfile stays alive as long as the
261 self.tabixfile = tabixfile.tabixfile
263 self.iterator = ti_query(self.tabixfile, NULL, 0, 0)
265 if <void*>self.iterator == NULL:
266 raise ValueError("can't open header.\n")
272 """python version of next().
274 pyrex uses this non-standard name instead of next()
280 # Getting the metachar is a pain as ti_index_t is incomplete type.
281 # simply use '#' for now.
282 s = ti_read(self.tabixfile, self.iterator, &len)
283 if s == NULL: raise StopIteration
284 # stop at first non-header line
285 if s[0] != '#': raise StopIteration
289 def __dealloc__(self):
290 if <void*>self.iterator != NULL:
291 ti_iter_destroy(self.iterator)
293 #########################################################
294 #########################################################
295 #########################################################
299 cdef class asTuple(Parser):
300 '''converts a :term:`tabix row` into a python tuple.
302 Access is by numeric index.
304 def __call__(self, char * buffer, int len):
305 cdef TabProxies.TupleProxy r
306 r = TabProxies.TupleProxy()
307 # need to copy - there were some
308 # persistence issues with "present"
309 r.copy( buffer, len )
312 cdef class asGTF(Parser):
313 '''converts a :term:`tabix row` into a GTF record with the following
323 genomic start coordinate (0-based)
325 genomic end coordinate plus one (0-based)
335 GTF formatted entries also defined the attributes:
340 the transcript identifier
343 def __call__(self, char * buffer, int len):
344 cdef TabProxies.GTFProxy r
345 r = TabProxies.GTFProxy()
346 r.copy( buffer, len )
349 cdef class asBed( Parser ):
350 '''converts a :term:`tabix row` into a bed record
351 with the following fields:
356 genomic start coordinate (zero-based)
358 genomic end coordinate plus one (zero-based)
374 ',' separated string of block sizes
376 ',' separated string of block genomic start positions
378 Only the first three fields are required. Additional
379 fields are optional, but if one is defined, all the preceeding
380 need to be defined as well.
383 def __call__(self, char * buffer, int len):
384 cdef TabProxies.BedProxy r
385 r = TabProxies.BedProxy()
386 r.copy( buffer, len )
389 cdef class asVCF( Parser ):
390 '''converts a :term:`tabix row` into a VCF record with
391 the following fields:
396 chromosomal position, zero-based
412 Access to genotypes is via index::
415 first_sample_genotype = vcf[0]
416 second_sample_genotype = vcf[1]
419 def __call__(self, char * buffer, int len ):
420 cdef TabProxies.VCFProxy r
421 r = TabProxies.VCFProxy()
422 r.copy( buffer, len )
425 #########################################################
426 #########################################################
427 #########################################################
428 cdef class TabixIteratorParsed:
429 """iterates over mapped reads in a region.
434 cdef ti_iter_t iterator
435 cdef tabix_t * tabixfile
445 assert tabixfile._isOpen()
448 # makes sure that samfile stays alive as long as the
450 self.tabixfile = tabixfile.tabixfile
453 # seek to start of file to ensure iteration is over
455 bgzf_seek( self.tabixfile.fp, 0, 0)
456 self.iterator = ti_iter_first()
458 self.iterator = ti_queryi(self.tabixfile, tid, start, end)
460 if <void*>self.iterator == NULL:
461 raise ValueError("malformatted query or wrong sequence name.\n")
467 """python version of next().
469 pyrex uses this non-standard name instead of next()
475 s = ti_read(self.tabixfile, self.iterator, &len)
476 if s == NULL: raise StopIteration
477 # todo: read metachar from configuration
478 if s[0] != '#': break
480 return self.parser(s, len)
482 def __dealloc__(self):
483 if <void*>self.iterator != NULL:
484 ti_iter_destroy(self.iterator)
486 def tabix_compress( filename_in,
491 compress *filename_in* writing the output to *filename_out*.
493 Raise an IOError if *filename_out* already exists, unless *force* is set.
496 if not force and os.path.exists(filename_out ):
497 raise IOError( "Filename '%s' already exists, use *force* to overwrite" % filename_out)
506 O_RDONLY = os.O_RDONLY
508 WINDOW_SIZE = 64 * 1024
510 fp = bgzf_open( filename_out, "w")
512 raise IOError( "could not open '%s' for writing" )
514 fd_src = open(filename_in, O_RDONLY)
516 raise IOError( "could not open '%s' for reading" )
518 buffer = malloc(WINDOW_SIZE)
521 c = read(fd_src, buffer, WINDOW_SIZE)
522 r = bgzf_write(fp, buffer, c)
525 raise OSError("writing failed")
529 if r < 0: raise OSError("writing failed")
531 def tabix_index( filename,
541 index tab-separated *filename* using tabix.
543 An existing index will not be overwritten unless
546 The index will be built from coordinates
547 in columns *seq_col*, *start_col* and *end_col*.
549 The contents of *filename* have to be sorted by
550 contig and position - the method does not check
551 if the file is sorted.
553 Column indices are 0-based. Coordinates in the file
554 are assumed to be 1-based.
556 If *preset* is provided, the column coordinates
557 are taken from a preset. Valid values for preset
558 are "gff", "bed", "sam", "vcf", psltbl", "pileup".
560 Lines beginning with *meta_char* and the first
561 *line_skip* lines will be skipped.
563 If *filename* does not end in ".gz", it will be automatically
564 compressed. The original file will be removed and only the
565 compressed file will be retained.
567 If *filename* ends in *gz*, the file is assumed to be already
568 compressed with bgzf.
570 returns the filename of the compressed data
573 if not os.path.exists(filename): raise IOError("No such file '%s'" % filename)
575 if not filename.endswith(".gz"):
577 tabix_compress( filename, filename + ".gz", force = force )
578 os.unlink( filename )
581 if not force and os.path.exists(filename + ".tbi" ):
582 raise IOError( "Filename '%s.tbi' already exists, use *force* to overwrite" )
585 # preset-code, contig, start, end, metachar for commends, lines to ignore at beginning
586 # 0 is a missing column
588 'gff' : ( 0, 1, 4, 5, ord('#'), 0 ),
589 'bed' : ( 0x10000, 1, 2, 3, ord('#'), 0 ),
590 'psltbl' : ( 0x10000, 15, 17, 18, ord('#'), 0 ),
591 'sam' : ( 1, 3, 4, 0, ord('#'), 0 ),
592 'vcf' : ( 2, 1, 2, 0, ord('#'), 0 ),
593 'pileup': (3, 1, 2, 0, ord('#'), 0 ),
598 conf_data = preset2conf[preset]
600 raise KeyError( "unknown preset '%s', valid presets are '%s'" % (preset, ",".join(preset2conf.keys() )))
602 if end_col == None: end_col = -1
605 # note that tabix internally works with 0-based coordinates and open/closed intervals.
606 # When using a preset, conversion is automatically taken care of.
607 # Otherwise, the coordinates are assumed to be 1-based closed intervals and
608 # -1 is subtracted from the start coordinate. To avoid doing this, set
609 # the TI_FLAG_UCSC=0x10000 flag:
610 if zerobased: preset = preset | 0x10000
612 conf_data = (preset, seq_col+1, start_col+1, end_col+1, ord(meta_char), 0)
615 conf.preset, conf.sc, conf.bc, conf.ec, conf.meta_char, conf.line_skip = conf_data
617 ti_index_build( filename, &conf)
621 __all__ = ["tabix_index",