1 # cython: embedsignature=True
2 # adds doc-strings for sphinx
4 import tempfile, os, sys, types, itertools, struct, ctypes
7 '''*(filename, mode='r')*
9 opens a :term:`tabix file` for reading. A missing
10 index (*filename* + ".tbi") will raise an exception.
# NOTE(review): fragment of the `Tabixfile` cdef class body. The leading
# numbers on each line are original-source line numbers embedded in this
# listing; the listing is non-contiguous, so some statements are elided.
15 # pointer to tabixfile
# C-level handle to the open tabix file; NULL means "closed".
16 cdef tabix_t * tabixfile
# Constructor delegates all arguments verbatim to _open().
18 def __cinit__(self, *args, **kwargs ):
20 self._open( *args, **kwargs )
# _isOpen (its `def` line is elided here): truth-test on the C handle.
23 '''return true if samfile has been opened.'''
24 return self.tabixfile != NULL
# _open (its `def` line is elided in this listing): validate mode, check that
# both the data file and its ".tbi" index exist, then open via the tabix C API.
30 '''open a :term:`tabix file` for reading.
# Only read mode is supported; any other mode fails the assertion.
33 assert mode in ( "r",), "invalid file opening mode `%s`" % mode
35 # close a previously opened file
36 if self.tabixfile != NULL: self.close()
39 self.filename = filename
# tabix convention: the index lives next to the data file as <name>.tbi
40 filename_index = filename + ".tbi"
43 # open file for writing
47 # open file for reading
# Fail early with IOError if either the data file or the index is missing.
48 if not os.path.exists( self.filename ):
49 raise IOError( "file `%s` not found" % self.filename)
51 if not os.path.exists( filename_index ):
52 raise IOError( "index `%s` not found" % filename_index)
54 # open file and load index
# ti_open returns NULL on failure; surface that as IOError as well.
55 self.tabixfile = ti_open( self.filename, filename_index )
57 if self.tabixfile == NULL:
58 raise IOError("could not open file `%s`" % filename )
60 def _parseRegion( self,
65 '''parse region information.
67 raise ValueError for invalid regions.
69 returns a tuple of region, tid, start and end. Region
70 is a valid samtools :term:`region` or None if the region
71 extends over the whole file.
73 Note that regions are 1-based, while start,end are python coordinates.
# Ensure the index is loaded before any region parsing.
75 ti_lazy_index_load( self.tabixfile )
83 rtid = rstart = rend = 0
85 # translate to a region
# Convert 0-based python start to the 1-based samtools string form.
# Missing start defaults to 1; missing end defaults to max_pos-1.
87 if start != None and end != None:
88 region = "%s:%i-%i" % (reference, start+1, end)
89 elif start == None and end != None:
90 region = "%s:%i-%i" % (reference, 1, end)
91 elif end == None and start != None:
92 region = "%s:%i-%i" % (reference, start+1, max_pos-1)
# Let the tabix C parser resolve contig name -> tid and fill coordinates.
97 ti_parse_region( self.tabixfile.idx, region, &rtid, &rstart, &rend)
98 if rtid < 0: raise ValueError( "invalid region `%s`" % region )
99 if rstart > rend: raise ValueError( 'invalid region: start (%i) > end (%i)' % (rstart, rend) )
100 if not 0 <= rstart < max_pos: raise ValueError( 'start out of range (%i)' % rstart )
101 if not 0 <= rend < max_pos: raise ValueError( 'end out of range (%i)' % rend )
103 return region, rtid, rstart, rend
# fetch (its `def` line is elided in this listing): region query entry point.
113 fetch one or more rows in a :term:`region` using 0-based indexing. The region is specified by
114 :term:`reference`, *start* and *end*. Alternatively, a samtools :term:`region` string can be supplied.
116 Without *reference* or *region* all entries will be fetched.
118 If only *reference* is set, all reads matching on *reference* will be fetched.
120 If *parser* is None, the results are returned as an unparsed string.
121 Otherwise, *parser* is assumed to be a functor that will return parsed
122 data (see for example :meth:`asTuple` and :meth:`asGTF`).
124 ti_lazy_index_load( self.tabixfile )
# Guard against use after close().
126 if not self._isOpen():
127 raise ValueError( "I/O operation on closed file" )
129 region, rtid, rstart, rend = self._parseRegion( reference, start, end, region )
# Unparsed variants: a concrete region vs. whole-file iteration (tid == -1).
133 return TabixIterator( self, rtid, rstart, rend )
135 return TabixIterator( self, -1, 0, 0 )
# Parsed variants mirror the two cases above, wrapping rows with *parser*.
138 return TabixIteratorParsed( self, rtid, rstart, rend, parser )
140 return TabixIteratorParsed( self, -1, 0, 0, parser )
# contigs property (declaration line elided): list of sequence names from the index.
143 '''chromosome names'''
# ti_seqname returns a C array of names plus a count via out-parameter.
145 cdef char ** sequences
148 ti_lazy_index_load( self.tabixfile )
149 sequences = ti_seqname( self.tabixfile.idx, &nsequences )
# Copy C strings into a python list (Pyrex-style counted loop).
152 for x from 0 <= x < nsequences:
153 result.append( sequences[x] )
cdef class TabixIterator:
157 """iterates over rows in *tabixfile* in region
158 given by *tid*, *start* and *end*.
# C-level iterator handle and a borrowed pointer to the open tabix file.
161 cdef ti_iter_t iterator
162 cdef tabix_t * tabixfile
164 def __cinit__(self, Tabixfile tabixfile,
165 int tid, int start, int end ):
167 assert tabixfile._isOpen()
169 # makes sure that samfile stays alive as long as the
# Keep only the raw C pointer; the python-level Tabixfile holds ownership.
171 self.tabixfile = tabixfile.tabixfile
174 # seek to start of file to ensure iteration is over
# Whole-file iteration: rewind the bgzf stream and start a plain iterator.
176 bgzf_seek( self.tabixfile.fp, 0, 0)
177 self.iterator = ti_iter_first()
# Region iteration: query by numeric tid and 0-based coordinates.
179 self.iterator = ti_queryi(self.tabixfile, tid, start, end)
181 if <void*>self.iterator == NULL:
182 raise ValueError("malformatted query or wrong sequence name.\n")
188 """python version of next().
190 pyrex uses this non-standard name instead of next()
# ti_read returns NULL at end of iteration -> translate to StopIteration.
195 s = ti_read(self.tabixfile, self.iterator, &len)
196 if s == NULL: raise StopIteration
# Release the C iterator when the python object is collected.
199 def __dealloc__(self):
200 if <void*>self.iterator != NULL:
201 ti_iter_destroy(self.iterator)
# toDot (its `def` line is elided): GTF-style missing-value marker.
204 '''convert value to '.' if None'''
205 if v == None: return "."
# quote (its `def` line is elided): string values are wrapped in quotes
# when serializing attributes; other types presumably pass through.
209 '''return a quoted attribute.'''
# types.StringTypes is python-2 era (str + unicode).
210 if type(v) in types.StringTypes:
cdef class TupleProxy:
216 '''Proxy class for access to parsed row as a tuple.
218 This class represents a table row for fast read-access.
227 def __cinit__(self ):
# take(): adopt the buffer pointer; this object becomes responsible for it.
233 cdef take( self, char * buffer, size_t nbytes ):
234 '''start presenting buffer.
236 Take ownership of the pointer.
239 self.update( buffer, nbytes )
# present(): borrow the buffer; caller keeps ownership/lifetime.
241 cdef present( self, char * buffer, size_t nbytes ):
242 '''start presenting buffer.
244 Do not take ownership of the pointer.
246 self.update( buffer, nbytes )
# copy(): duplicate the buffer (nbytes + terminating NUL) into owned memory.
248 cdef copy( self, char * buffer, size_t nbytes ):
249 '''start presenting buffer.
251 Take a copy of buffer.
255 s = sizeof(char) * (nbytes + 1)
256 self.data = <char*>malloc( s )
257 memcpy( <char*>self.data, buffer, s )
258 self.update( self.data, nbytes )
# update(): split the NUL-terminated buffer into tab-separated fields,
# storing pointers into the buffer itself (no per-field copies).
260 cdef update( self, char * buffer, size_t nbytes ):
261 '''update internal data.'''
# Buffer must be NUL-terminated at exactly nbytes.
268 if buffer[nbytes] != 0:
269 raise ValueError( "incomplete line at %s" % buffer )
271 if self.fields != NULL:
# Heuristic capacity: a field needs at least ~4 bytes, so nbytes/4 slots.
274 max_fields = nbytes / 4
275 self.fields = <char **>calloc( max_fields, sizeof(char *) )
# Scan for tabs with memchr; each hit starts the next field.
284 pos = <char*>memchr( pos, '\t', nbytes )
285 if pos == NULL: break
288 self.fields[field] = pos
290 if field >= max_fields:
291 raise ValueError("row too large - more than %i fields" % max_fields )
292 nbytes -= pos - old_pos
# Sequence protocol: python-style negative indices are supported.
298 def __getitem__( self, key ):
302 if i < 0: i += self.nfields
303 if i >= self.nfields or i < 0:
304 raise IndexError( "list index out of range" )
305 return self.fields[i]
# Free owned buffer memory on collection.
310 def __dealloc__(self):
311 if self.data != NULL:
319 """python version of next().
# Iterator protocol over fields; index was advanced before this return.
321 if self.index >= self.nfields:
324 return self.fields[self.index-1]
# NOTE(review): interior of the GTFProxy cdef class; its `cdef class` header
# line is elided from this listing.
327 '''Proxy class for access to GTF fields.
329 This class represents a GTF entry for fast read-access.
330 Write-access has been added as well, though some care must
331 be taken. If any of the string fields (contig, source, ...)
332 are set, the new value is tied to the lifetime of the
333 argument that was supplied.
335 The only exception is the attributes field when set from
336 a dictionary - this field will manage its own memory.
# True when the attributes string was allocated by fromDict() and
# therefore must be freed by this object.
353 cdef bint hasOwnAttributes
355 def __cinit__(self ):
357 self.isModified = False
358 self.hasOwnAttributes = False
# take/present/copy mirror TupleProxy's ownership variants and reset
# the modification flag after (re)parsing.
360 cdef take( self, char * buffer, size_t nbytes ):
361 '''start presenting buffer.
363 Take ownership of the pointer.
366 self.update( buffer, nbytes )
367 self.isModified = False
369 cdef present( self, char * buffer, size_t nbytes ):
370 '''start presenting buffer.
372 Do not take ownership of the pointer.
374 self.update( buffer, nbytes )
375 self.isModified = False
377 cdef copy( self, char * buffer, size_t nbytes ):
378 '''start presenting buffer.
380 Take a copy of buffer.
384 s = sizeof(char) * (nbytes + 1)
385 self.data = <char*>malloc( s )
386 memcpy( <char*>self.data, buffer, s )
387 self.update( self.data, nbytes )
388 self.isModified = False
# update(): parse the nine tab-separated GTF columns in place by walking
# successive strchr('\t') hits; each NULL result means a malformed row.
390 cdef update( self, char * buffer, size_t nbytes ):
391 '''update internal data.
393 nbytes does not include the terminal '\0'.
396 cdef char * cstart, * cend, * cscore
401 if buffer[nbytes] != 0:
402 raise ValueError( "incomplete line at %s" % buffer )
404 pos = strchr( buffer, '\t' )
405 if pos == NULL: raise ValueError( "malformatted entry at %s" % buffer )
410 pos = strchr( pos, '\t' )
411 if pos == NULL: raise ValueError( "malformatted entry at %s" % buffer )
416 pos = strchr( pos, '\t' )
417 if pos == NULL: raise ValueError( "malformatted entry at %s" % buffer )
422 pos = strchr( pos, '\t' )
423 if pos == NULL: raise ValueError( "malformatted entry at %s" % buffer )
428 pos = strchr( pos, '\t' )
429 if pos == NULL: raise ValueError( "malformatted entry at %s" % buffer )
434 pos = strchr( pos, '\t' )
435 if pos == NULL: raise ValueError( "malformatted entry at %s" % buffer )
440 pos = strchr( pos, '\t' )
441 if pos == NULL: raise ValueError( "malformatted entry at %s" % buffer )
446 pos = strchr( pos, '\t' )
447 if pos == NULL: raise ValueError( "malformatted entry at %s" % buffer )
# Remainder of the line is the attributes column.
450 self.attributes = pos
# Convert GTF 1-based closed start to python 0-based; end stays as-is,
# yielding 0-based half-open coordinates overall.
451 self.start = atoi( cstart ) - 1
452 self.end = atoi( cend )
# Column accessors: each property returns the parsed field and, on
# assignment, records the change via isModified (setter bodies partially
# elided from this listing).
455 '''contig of feature.'''
456 def __get__( self ): return self.contig
457 def __set__( self, value ):
458 self.isModified = True
463 def __get__( self ): return self.feature
464 def __set__( self, value ):
465 self.isModified = True
469 '''feature source.'''
470 def __get__( self ): return self.source
471 def __set__( self, value ):
472 self.isModified = True
476 '''feature start (in 0-based open/closed coordinates).'''
477 def __get__( self ): return self.start
478 def __set__( self, value ):
479 self.isModified = True
483 '''feature end (in 0-based open/closed coordinates).'''
484 def __get__( self ): return self.end
485 def __set__( self, value ):
486 self.isModified = True
# score: a lone "." denotes a missing score; otherwise parse as float.
492 if self.score[0] == '.' and self.score[1] == '\0' :
495 return atof(self.score)
496 def __set__( self, value ):
497 self.isModified = True
501 '''feature strand.'''
502 def __get__( self ): return self.strand
503 def __set__( self, value ):
504 self.isModified = True
509 def __get__( self ): return self.frame
510 def __set__( self, value ):
511 self.isModified = True
515 '''feature attributes (as a string).'''
516 def __get__( self ): return self.attributes
517 def __set__( self, value ):
518 self.isModified = True
# NOTE(review): storing `value` directly ties the C pointer to the
# lifetime of the python argument, as warned in the class docstring.
519 self.attributes = value
# asDict (its `def` line is elided): parse the GTF attributes column
# (semicolon-separated `key "value"` pairs) into a python dict.
522 """parse attributes - return as dict
526 attributes = self.attributes
528 # separate into fields
# Trailing element after the final ';' is empty, hence the [:-1] slice.
529 fields = [ x.strip() for x in attributes.split(";")[:-1]]
535 d = [ x.strip() for x in f.split(" ")]
# Values containing spaces keep all trailing tokens.
538 if len(d) > 2: v = d[1:]
# Strip surrounding quotes from quoted values.
540 if v[0] == '"' and v[-1] == '"':
543 ## try to convert to a value
556 def fromDict( self, d ):
557 '''set attributes from a dictionary.'''
561 # clean up if this field is set twice
562 if self.hasOwnAttributes:
563 free(self.attributes)
# Serialize: strings are quoted, everything else uses str().
566 for k,v in d.items():
567 if type(v) == types.StringType:
568 aa.append( '%s "%s"' % (k,v) )
570 aa.append( '%s %s' % (k,str(v)) )
572 a = "; ".join( aa ) + ";"
# Allocate owned memory for the serialized string (+1 for the NUL).
575 self.attributes = <char *>calloc( l + 1, sizeof(char) )
576 memcpy( self.attributes, p, l )
578 self.hasOwnAttributes = True
579 self.isModified = True
# __str__ fragment (def line elided): rebuild the tab-separated row by
# copying the buffer and turning the parser's NUL separators back into tabs.
597 cpy = <char*>calloc( sizeof(char), self.nbytes+1 )
598 memcpy( cpy, self.data, self.nbytes+1)
599 for x from 0 <= x < self.nbytes:
600 if cpy[x] == '\0': cpy[x] = '\t'
605 def invert( self, int lcontig ):
606 '''invert coordinates to negative strand coordinates
608 This method will only act if the feature is on the
# Only features on the '-' strand are mirrored against contig length.
611 if self.strand[0] == '-':
612 start = min(self.start, self.end)
613 end = max(self.start, self.end)
614 self.start, self.end = lcontig - end, lcontig - start
# keys (def line elided): first token of each ';'-separated attribute.
617 '''return a list of attributes defined in this entry.'''
619 return [ x.strip().split(" ")[0] for x in r.split(";") if x.strip() != '' ]
# Dict-style access delegates to attribute lookup.
621 def __getitem__(self, item):
622 return self.__getattr__( item )
# Free owned buffers: the row copy and, if fromDict() allocated it,
# the attributes string.
624 def __dealloc__(self):
625 if self.data != NULL:
627 if self.hasOwnAttributes:
628 free(self.attributes)
630 def __getattr__(self, item ):
631 """Generic lookup of attribute from GFF/GTF attributes
632 Only called if there *isn't* an attribute with this name
# Locate `item` inside the raw attributes string with strstr.
641 start = strstr( self.attributes, query)
643 raise AttributeError("'GTFProxy' has no attribute '%s'" % item )
# Skip past the key and the separating space, then any extra blanks.
645 start += strlen(query) + 1
647 while start[0] == " ": start += 1
# Value runs to the closing quote (or end of string).
651 while end[0] != '\0' and end[0] != '"': end += 1
653 cpy = <char*>calloc( l, sizeof(char ) )
654 memcpy( cpy, start, l )
662 def setAttribute( self, name, value ):
663 '''convenience method to set an attribute.'''
cdef class asTuple(Parser):
672 '''converts a :term:`tabix row` into a python tuple.'''
# Functor used by fetch(parser=...): wraps the raw row in a TupleProxy.
673 def __call__(self, char * buffer, int len):
676 # need to copy - there were some
677 # persistence issues with "present"
678 r.copy( buffer, len )
cdef class asGTF(Parser):
682 '''converts a :term:`tabix row` into a GTF record.'''
# Same copy-based strategy as asTuple, producing a GTFProxy.
683 def __call__(self, char * buffer, int len):
686 r.copy( buffer, len )
cdef class TabixIteratorParsed:
690 """iterates over mapped reads in a region.
# Parallels TabixIterator, but each fetched row is passed through the
# stored parser functor before being returned.
693 cdef ti_iter_t iterator
694 cdef tabix_t * tabixfile
704 assert tabixfile._isOpen()
707 # makes sure that samfile stays alive as long as the
# Borrow the raw C pointer; the Tabixfile object owns the file.
709 self.tabixfile = tabixfile.tabixfile
712 # seek to start of file to ensure iteration is over
714 bgzf_seek( self.tabixfile.fp, 0, 0)
715 self.iterator = ti_iter_first()
# Region query path (tid >= 0).
717 self.iterator = ti_queryi(self.tabixfile, tid, start, end)
719 if <void*>self.iterator == NULL:
720 raise ValueError("malformatted query or wrong sequence name.\n")
726 """python version of next().
728 pyrex uses this non-standard name instead of next()
733 s = ti_read(self.tabixfile, self.iterator, &len)
734 if s == NULL: raise StopIteration
# Delegate row parsing to the functor supplied at construction.
735 return self.parser(s, len)
737 def __dealloc__(self):
738 if <void*>self.iterator != NULL:
739 ti_iter_destroy(self.iterator)
741 def tabix_compress( filename_in,
746 compress *filename_in* writing the output to *filename_out*.
748 Raise an IOError if *filename_out* already exists, unless *force* is set.
751 if not force and os.path.exists(filename_out ):
752 raise IOError( "Filename '%s' already exists, use *force* to overwrite" % filename_out)
# Copy loop parameters: raw read of the input, bgzf-compressed write-out.
761 O_RDONLY = os.O_RDONLY
763 WINDOW_SIZE = 64 * 1024
765 fp = bgzf_open( filename_out, "w")
# NOTE(review): this error message has a '%s' placeholder but no format
# argument on the visible line — likely a latent bug upstream.
767 raise IOError( "could not open '%s' for writing" )
769 fd_src = open(filename_in, O_RDONLY)
# NOTE(review): same missing format argument here.
771 raise IOError( "could not open '%s' for reading" )
773 buffer = malloc(WINDOW_SIZE)
# Stream WINDOW_SIZE chunks until read() returns 0 bytes.
776 c = read(fd_src, buffer, WINDOW_SIZE)
777 r = bgzf_write(fp, buffer, c)
780 raise OSError("writing failed")
# Final flush/close result is also checked.
784 if r < 0: raise OSError("writing failed")
786 def tabix_index( filename,
796 index tab-separated *filename* using tabix.
798 An existing index will not be overwritten unless
801 The index will be built from coordinates
802 in columns *seq_col*, *start_col* and *end_col*.
804 The contents of *filename* have to be sorted by
805 contig and position - the method does not check
806 if the file is sorted.
808 Column indices are 0-based. Coordinates in the file
809 are assumed to be 1-based.
811 If *preset* is provided, the column coordinates
812 are taken from a preset. Valid values for preset
813 are "gff", "bed", "sam", "vcf", psltbl", "pileup".
815 Lines beginning with *meta_char* and the first
816 *line_skip* lines will be skipped.
818 If *filename* does not end in ".gz", it will be automatically
819 compressed. The original file will be removed and only the
820 compressed file will be retained.
822 If *filename* ends in *gz*, the file is assumed to be already
823 compressed with bgzf.
825 returns the filename of the compressed data
828 if not os.path.exists(filename): raise IOError("No such file '%s'" % filename)
# Transparently bgzf-compress plain files, replacing the original.
830 if not filename.endswith(".gz"):
832 tabix_compress( filename, filename + ".gz", force = force )
833 os.unlink( filename )
836 if not force and os.path.exists(filename + ".tbi" ):
# NOTE(review): '%s.tbi' placeholder without a format argument on the
# visible line — message will contain the literal '%s'.
837 raise IOError( "Filename '%s.tbi' already exists, use *force* to overwrite" )
840 # preset-code, contig, start, end, metachar for commends, lines to ignore at beginning
841 # 0 is a missing column
# Each tuple: (preset flag, seq col, start col, end col, meta char, skip).
# Columns here are 1-based as the tabix C library expects.
843 'gff' : ( 0, 1, 4, 5, ord('#'), 0 ),
844 'bed' : ( 0x10000, 1, 2, 3, ord('#'), 0 ),
845 'psltbl' : ( 0x10000, 15, 17, 18, ord('#'), 0 ),
846 'sam' : ( 1, 3, 4, 0, ord('#'), 0 ),
847 'vcf' : ( 2, 1, 2, 0, ord('#'), 0 ),
848 'pileup': (3, 1, 2, 0, ord('#'), 0 ),
853 conf_data = preset2conf[preset]
855 raise KeyError( "unknown preset '%s', valid presets are '%s'" % (preset, ",".join(preset2conf.keys() )))
# A missing end column is signalled to the C layer as -1.
857 if end_col == None: end_col = -1
860 # note that tabix internally works with 0-based coordinates and open/closed intervals.
861 # When using a preset, conversion is automatically taken care of.
862 # Otherwise, the coordinates are assumed to be 1-based closed intervals and
863 # -1 is subtracted from the start coordinate. To avoid doing this, set
864 # the TI_FLAG_UCSC=0x10000 flag:
865 if zerobased: preset = preset | 0x10000
# Shift user-facing 0-based column indices to the 1-based C convention.
867 conf_data = (preset, seq_col+1, start_col+1, end_col+1, ord(meta_char), 0)
# Populate the C ti_conf_t struct and build the .tbi index.
870 conf.preset, conf.sc, conf.bc, conf.ec, conf.meta_char, conf.line_skip = conf_data
872 ti_index_build( filename, &conf)
876 __all__ = ["tabix_index",