1 # cython: embedsignature=True
2 # adds doc-strings for sphinx
4 import tempfile, os, sys, types, itertools, struct, ctypes, gzip
5 from cpython cimport PyString_FromStringAndSize, PyString_AS_STRING
9 '''*(filename, mode='r')*
11 opens a :term:`tabix file` for reading. A missing
12 index (*filename* + ".tbi") will raise an exception.
    def __cinit__(self, *args, **kwargs ):
        # Delegate to _open() so constructing a Tabixfile accepts the
        # same arguments as an explicit open call.
        self._open( *args, **kwargs )
20 '''return true if samfile has been opened.'''
21 return self.tabixfile != NULL
27 '''open a :term:`tabix file` for reading.
30 assert mode in ( "r",), "invalid file opening mode `%s`" % mode
32 # close a previously opened file
33 if self.tabixfile != NULL: self.close()
36 self.filename = filename
37 filename_index = filename + ".tbi"
40 # open file for writing
44 # open file for reading
45 if not os.path.exists( self.filename ):
46 raise IOError( "file `%s` not found" % self.filename)
48 if not os.path.exists( filename_index ):
49 raise IOError( "index `%s` not found" % filename_index)
51 # open file and load index
52 self.tabixfile = ti_open( self.filename, filename_index )
54 if self.tabixfile == NULL:
55 raise IOError("could not open file `%s`" % filename )
57 def _parseRegion( self,
62 '''parse region information.
64 raise ValueError for for invalid regions.
66 returns a tuple of region, tid, start and end. Region
67 is a valid samtools :term:`region` or None if the region
68 extends over the whole file.
70 Note that regions are 1-based, while start,end are python coordinates.
72 ti_lazy_index_load( self.tabixfile )
80 rtid = rstart = rend = 0
82 # translate to a region
84 if start != None and end != None:
85 region = "%s:%i-%i" % (reference, start+1, end)
86 elif start == None and end != None:
87 region = "%s:%i-%i" % (reference, 1, end)
88 elif end == None and start != None:
89 region = "%s:%i-%i" % (reference, start+1, max_pos-1)
94 ti_parse_region( self.tabixfile.idx, region, &rtid, &rstart, &rend)
95 if rtid < 0: raise ValueError( "invalid region `%s`" % region )
96 if rstart > rend: raise ValueError( 'invalid region: start (%i) > end (%i)' % (rstart, rend) )
97 if not 0 <= rstart < max_pos: raise ValueError( 'start out of range (%i)' % rstart )
98 if not 0 <= rend < max_pos: raise ValueError( 'end out of range (%i)' % rend )
100 return region, rtid, rstart, rend
110 fetch one or more rows in a :term:`region` using 0-based indexing. The region is specified by
111 :term:`reference`, *start* and *end*. Alternatively, a samtools :term:`region` string can be supplied.
113 Without *reference* or *region* all entries will be fetched.
115 If only *reference* is set, all reads matching on *reference* will be fetched.
117 If *parser* is None, the results are returned as an unparsed string.
118 Otherwise, *parser* is assumed to be a functor that will return parsed
119 data (see for example :meth:`asTuple` and :meth:`asGTF`).
121 ti_lazy_index_load( self.tabixfile )
123 if not self._isOpen():
124 raise ValueError( "I/O operation on closed file" )
126 region, rtid, rstart, rend = self._parseRegion( reference, start, end, region )
130 return TabixIterator( self, rtid, rstart, rend )
132 return TabixIterator( self, -1, 0, 0 )
135 return TabixIteratorParsed( self, rtid, rstart, rend, parser )
137 return TabixIteratorParsed( self, -1, 0, 0, parser )
141 '''return header lines as an iterator.
143 Note that the header lines do not contain the newline '\n' character.
145 return TabixHeaderIterator( self )
148 '''chromosome names'''
150 cdef char ** sequences
153 ti_lazy_index_load( self.tabixfile )
154 sequences = ti_seqname( self.tabixfile.idx, &nsequences )
157 for x from 0 <= x < nsequences:
158 result.append( sequences[x] )
cdef class TabixIterator:
    """iterates over rows in *tabixfile* in region
    given by *tid*, *start* and *end*.
    """

    # C-level iterator handle and the underlying tabix file pointer
    cdef ti_iter_t iterator
    cdef tabix_t * tabixfile

    def __cinit__(self, Tabixfile tabixfile,
                  int tid, int start, int end ):

        assert tabixfile._isOpen()

        # makes sure that samfile stays alive as long as the
        # iterator is alive
        self.tabixfile = tabixfile.tabixfile

        if tid < 0:
            # seek to start of file to ensure iteration is over
            # the whole file
            bgzf_seek( self.tabixfile.fp, 0, 0)
            self.iterator = ti_iter_first()
        else:
            self.iterator = ti_queryi(self.tabixfile, tid, start, end)

        if <void*>self.iterator == NULL:
            raise ValueError("malformatted query or wrong sequence name.\n")

    def __iter__(self):
        return self

    def __next__(self):
        """python version of next().

        pyrex uses this non-standard name instead of next()
        """
        cdef char * s
        cdef int len
        # metachar filtering does not work within tabix
        # though it should. Getting the metachar is a pain
        # as ti_index_t is incomplete type.
        # simply use '#' for now.
        while 1:
            s = ti_read(self.tabixfile, self.iterator, &len)
            if s == NULL: raise StopIteration
            if s[0] != '#': break

        return s

    def __dealloc__(self):
        if <void*>self.iterator != NULL:
            ti_iter_destroy(self.iterator)
cdef class TabixHeaderIterator:
    """return header lines.
    """

    # C-level iterator handle and the underlying tabix file pointer
    cdef ti_iter_t iterator
    cdef tabix_t * tabixfile

    def __cinit__(self, Tabixfile tabixfile ):

        assert tabixfile._isOpen()

        # makes sure that samfile stays alive as long as the
        # iterator is alive
        self.tabixfile = tabixfile.tabixfile

        self.iterator = ti_query(self.tabixfile, NULL, 0, 0)

        if <void*>self.iterator == NULL:
            raise ValueError("can't open header.\n")

    def __iter__(self):
        return self

    def __next__(self):
        """python version of next().

        pyrex uses this non-standard name instead of next()
        """
        cdef char * s
        cdef int len

        # Getting the metachar is a pain as ti_index_t is incomplete type.
        # simply use '#' for now.
        s = ti_read(self.tabixfile, self.iterator, &len)
        if s == NULL: raise StopIteration
        # stop at first non-header line
        if s[0] != '#': raise StopIteration

        return s

    def __dealloc__(self):
        if <void*>self.iterator != NULL:
            ti_iter_destroy(self.iterator)
261 #########################################################
262 #########################################################
263 #########################################################
cdef class asTuple(Parser):
    '''converts a :term:`tabix row` into a python tuple.'''
    def __call__(self, char * buffer, int len):
        cdef TabProxies.TupleProxy r
        r = TabProxies.TupleProxy()
        # need to copy - there were some
        # persistence issues with "present"
        r.copy( buffer, len )
        # return the populated proxy to the caller (without this the
        # parser silently yields None for every row)
        return r
cdef class asGTF(Parser):
    '''converts a :term:`tabix row` into a GTF record.'''
    def __call__(self, char * buffer, int len):
        cdef TabProxies.GTFProxy r
        r = TabProxies.GTFProxy()
        # copy the row into the proxy (see asTuple for why a copy is made)
        r.copy( buffer, len )
        return r
cdef class asBed( Parser ):
    '''converts a :term:`tabix row` into a BED record.'''
    def __call__(self, char * buffer, int len):
        cdef TabProxies.BedProxy r
        r = TabProxies.BedProxy()
        # copy the row into the proxy (see asTuple for why a copy is made)
        r.copy( buffer, len )
        return r
cdef class asVCF( Parser ):
    '''converts a :term:`tabix row` into a VCF record.'''
    def __call__(self, char * buffer, int len ):
        cdef TabProxies.VCFProxy r
        r = TabProxies.VCFProxy()
        # copy the row into the proxy (see asTuple for why a copy is made)
        r.copy( buffer, len )
        return r
301 #########################################################
302 #########################################################
303 #########################################################
cdef class TabixIteratorParsed:
    """iterates over mapped reads in a region.

    Returns parsed data.
    """

    # C-level iterator handle, the underlying tabix file pointer, and
    # the functor applied to each row before it is returned
    cdef ti_iter_t iterator
    cdef tabix_t * tabixfile
    cdef Parser parser

    def __cinit__(self,
                  Tabixfile tabixfile,
                  int tid,
                  int start,
                  int end,
                  Parser parser ):

        assert tabixfile._isOpen()
        self.parser = parser

        # makes sure that samfile stays alive as long as the
        # iterator is alive
        self.tabixfile = tabixfile.tabixfile

        if tid < 0:
            # seek to start of file to ensure iteration is over
            # the whole file
            bgzf_seek( self.tabixfile.fp, 0, 0)
            self.iterator = ti_iter_first()
        else:
            self.iterator = ti_queryi(self.tabixfile, tid, start, end)

        if <void*>self.iterator == NULL:
            raise ValueError("malformatted query or wrong sequence name.\n")

    def __iter__(self):
        return self

    def __next__(self):
        """python version of next().

        pyrex uses this non-standard name instead of next()
        """
        cdef char * s
        cdef int len
        while 1:
            s = ti_read(self.tabixfile, self.iterator, &len)
            if s == NULL: raise StopIteration
            # todo: read metachar from configuration
            if s[0] != '#': break

        return self.parser(s, len)

    def __dealloc__(self):
        if <void*>self.iterator != NULL:
            ti_iter_destroy(self.iterator)
def tabix_compress( filename_in,
                    filename_out,
                    force = False ):
    '''
    compress *filename_in* writing the output to *filename_out*.

    Raise an IOError if *filename_out* already exists, unless *force* is set.
    '''

    if not force and os.path.exists(filename_out ):
        raise IOError( "Filename '%s' already exists, use *force* to overwrite" % filename_out)

    cdef int WINDOW_SIZE
    cdef int c, r
    cdef void * buffer
    cdef BGZF * fp
    cdef int fd_src
    cdef int O_RDONLY

    O_RDONLY = os.O_RDONLY
    WINDOW_SIZE = 64 * 1024

    fp = bgzf_open( filename_out, "w")
    if fp == NULL:
        # bug fix: the format string previously had no argument, so the
        # message printed a literal '%s'
        raise IOError( "could not open '%s' for writing" % filename_out )

    fd_src = open(filename_in, O_RDONLY)
    if fd_src == 0:
        # bug fix: supply the missing format argument (see above)
        raise IOError( "could not open '%s' for reading" % filename_in )

    buffer = malloc(WINDOW_SIZE)

    # copy the input in WINDOW_SIZE chunks until read() signals EOF
    while 1:
        c = read(fd_src, buffer, WINDOW_SIZE)
        if c <= 0: break
        r = bgzf_write(fp, buffer, c)
        if r < 0:
            free( buffer )
            raise OSError("writing failed")

    free( buffer )
    # NOTE(review): fd_src does not appear to be closed anywhere in this
    # fragment — confirm whether the C-level close() is declared and
    # close the descriptor if so.
    r = bgzf_close(fp)
    if r < 0: raise OSError("writing failed")
def tabix_index( filename,
                 force = False,
                 seq_col = None,
                 start_col = None,
                 end_col = None,
                 preset = None,
                 meta_char = "#",
                 zerobased = False,
                ):
    '''
    index tab-separated *filename* using tabix.

    An existing index will not be overwritten unless
    *force* is set.

    The index will be built from coordinates
    in columns *seq_col*, *start_col* and *end_col*.

    The contents of *filename* have to be sorted by
    contig and position - the method does not check
    if the file is sorted.

    Column indices are 0-based. Coordinates in the file
    are assumed to be 1-based.

    If *preset* is provided, the column coordinates
    are taken from a preset. Valid values for preset
    are "gff", "bed", "sam", "vcf", "psltbl", "pileup".

    Lines beginning with *meta_char* and the first
    *line_skip* lines will be skipped.

    If *filename* does not end in ".gz", it will be automatically
    compressed. The original file will be removed and only the
    compressed file will be retained.

    If *filename* ends in *gz*, the file is assumed to be already
    compressed with bgzf.

    returns the filename of the compressed data
    '''

    if not os.path.exists(filename): raise IOError("No such file '%s'" % filename)

    if not filename.endswith(".gz"):
        # compress in place and drop the uncompressed original
        tabix_compress( filename, filename + ".gz", force = force )
        os.unlink( filename )
        filename += ".gz"

    if not force and os.path.exists(filename + ".tbi" ):
        # bug fix: the format string previously had no argument, so the
        # message printed a literal '%s.tbi'
        raise IOError( "Filename '%s.tbi' already exists, use *force* to overwrite" % filename )

    # columns are 1-based within tabix:
    # preset-code, contig, start, end, metachar for comments, lines to ignore at beginning
    # 0 is a missing column
    preset2conf = {
        'gff' : ( 0, 1, 4, 5, ord('#'), 0 ),
        'bed' : ( 0x10000, 1, 2, 3, ord('#'), 0 ),
        'psltbl' : ( 0x10000, 15, 17, 18, ord('#'), 0 ),
        'sam' : ( 1, 3, 4, 0, ord('#'), 0 ),
        'vcf' : ( 2, 1, 2, 0, ord('#'), 0 ),
        'pileup': (3, 1, 2, 0, ord('#'), 0 ),
        }

    if preset:
        try:
            conf_data = preset2conf[preset]
        except KeyError:
            raise KeyError( "unknown preset '%s', valid presets are '%s'" % (preset, ",".join(preset2conf.keys() )))
    else:
        if end_col is None: end_col = -1
        preset = 0

        # note that tabix internally works with 0-based coordinates and open/closed intervals.
        # When using a preset, conversion is automatically taken care of.
        # Otherwise, the coordinates are assumed to be 1-based closed intervals and
        # -1 is subtracted from the start coordinate. To avoid doing this, set
        # the TI_FLAG_UCSC=0x10000 flag:
        if zerobased: preset = preset | 0x10000

        conf_data = (preset, seq_col+1, start_col+1, end_col+1, ord(meta_char), 0)

    cdef ti_conf_t conf
    conf.preset, conf.sc, conf.bc, conf.ec, conf.meta_char, conf.line_skip = conf_data

    ti_index_build( filename, &conf)

    return filename
497 __all__ = ["tabix_index",