1 # cython: embedsignature=True
2 # adds doc-strings for sphinx
# Helper functions for python 3 compatibility - taken from csamtools.pyx
6 import tempfile, os, sys, types, itertools, struct, ctypes, gzip
10 from cpython cimport PyErr_SetString, PyBytes_Check, \
11 PyUnicode_Check, PyBytes_FromStringAndSize, \
12 PyObject_AsFileDescriptor
14 PYTHON3 = PY_MAJOR_VERSION >= 3
16 # from cpython cimport PyString_FromStringAndSize, PyString_AS_STRING
17 from cpython.version cimport PY_MAJOR_VERSION
cdef from_string_and_size(char* s, size_t length):
    """Convert a C char buffer of *length* bytes to the native str type.

    NOTE(review): the Python-2 branch body and the `else` line are not
    visible in this view; the ASCII decode below is presumably the
    Python-3 path — confirm against the full source.
    """
    if PY_MAJOR_VERSION < 3:
        return s[:length].decode("ascii")
# filename encoding (copied from lxml.etree.pyx)
# Resolve the encoding used for filenames: prefer the OS filesystem
# encoding, fall back to the interpreter default, and finally to ASCII.
cdef str _FILENAME_ENCODING
_FILENAME_ENCODING = sys.getfilesystemencoding()
if _FILENAME_ENCODING is None:
    _FILENAME_ENCODING = sys.getdefaultencoding()
if _FILENAME_ENCODING is None:
    _FILENAME_ENCODING = 'ascii'
#cdef char* _C_FILENAME_ENCODING
#_C_FILENAME_ENCODING = <char*>_FILENAME_ENCODING
cdef bytes _my_encodeFilename(object filename):
    # NOTE(review): the first branch (presumably a None check) and the
    # bytes-passthrough return are not visible in this view.
    u"""Make sure a filename is 8-bit encoded (or None).
    """
    elif PyBytes_Check(filename):
        # already bytes: presumably returned unchanged (return line not visible)
    elif PyUnicode_Check(filename):
        # unicode: encode with the resolved filesystem encoding
        return filename.encode(_FILENAME_ENCODING)
    # any other type is rejected (Py2-style raise statement)
    raise TypeError, u"Argument must be string or unicode."
cdef bytes _force_bytes(object s):
    # NOTE(review): several interior lines (None handling, the Py2
    # passthrough and the bytes-passthrough returns) are not visible
    # in this view.
    u"""convert string or unicode object to bytes, assuming ascii encoding.
    """
    if PY_MAJOR_VERSION < 3:
        # Py2: body not visible in this view
    elif PyBytes_Check(s):
        # already bytes: body not visible in this view
    elif PyUnicode_Check(s):
        return s.encode('ascii')
    # any other type is rejected (Py2-style raise statement)
    raise TypeError, u"Argument must be string, bytes or unicode."
cdef inline bytes _force_cmdline_bytes(object s):
    """Encode a command-line argument to bytes.

    Thin alias for :func:`_force_bytes`; kept as a separate name so
    call sites document their intent.
    """
    return _force_bytes(s)
cdef _charptr_to_str(char* s):
    """Convert a NUL-terminated C string to the native str type.

    NOTE(review): the Python-2 branch body and `else` line are not
    visible in this view; the ASCII decode below is presumably the
    Python-3 path — confirm against the full source.
    """
    if PY_MAJOR_VERSION < 3:
        return s.decode("ascii")
cdef _force_str(object s):
    """Return s converted to str type of current Python (bytes in Py2, unicode in Py3)"""
    # NOTE(review): interior lines (None handling, the Py2 passthrough
    # and the final fall-through return) are not visible in this view.
    if PY_MAJOR_VERSION < 3:
        # Py2: body not visible in this view
    elif PyBytes_Check(s):
        return s.decode('ascii')
85 '''*(filename, mode='r')*
87 opens a :term:`tabix file` for reading. A missing
88 index (*filename* + ".tbi") will raise an exception.
def __cinit__(self, filename, mode = 'r', *args, **kwargs ):
    # Delegate all opening work to _open(); extra positional and
    # keyword arguments are passed through unchanged.
    self._open( filename, mode, *args, **kwargs )
# _isOpen — the `def` header line is not visible in this view.
'''return true if samfile has been opened.'''
# True while the underlying C tabix handle is non-NULL; close() resets it.
return self.tabixfile != NULL
# _open — the `def _open(...)` header and several interior lines are
# not visible in this view.
'''open a :term:`tabix file` for reading.
'''
# only read mode is supported
assert mode in ( "r",), "invalid file opening mode `%s`" % mode

# close a previously opened file
if self.tabixfile != NULL: self.close()
self.tabixfile = NULL

# the index file is expected next to the data file
filename_index = filename + ".tbi"
self.isremote = filename.startswith( "http:") or filename.startswith( "ftp:" )

# encode all the strings
filename = _my_encodeFilename(filename)
filename_index = _my_encodeFilename(filename_index)
cdef bytes bmode = mode.encode('ascii')

# free any previously stored filename before taking a fresh copy
if self._filename != NULL: free(self._filename )

self._filename = strdup(filename)

# open file for writing
# NOTE(review): the guarding condition line is not visible here —
# write mode is explicitly unsupported.
raise NotImplementedError("writing to tabix files not implemented" )

# open file for reading
# existence checks are skipped for remote (http/ftp) files
if not self.isremote:
    if not os.path.exists( filename ):
        raise IOError( "file `%s` not found" % filename)

    if not os.path.exists( filename_index ):
        raise IOError( "index `%s` not found" % filename_index)

# open file and load index
self.tabixfile = ti_open( filename, filename_index )

if self.tabixfile == NULL:
    raise IOError("could not open file `%s`" % filename )
def _parseRegion( self,
                  # NOTE(review): the remaining signature lines
                  # (reference, start, end, region parameters) are not
                  # visible in this view.
    '''parse region information.

    raise ValueError for invalid regions.

    returns a tuple of region, tid, start and end. Region
    is a valid samtools :term:`region` or None if the region
    extends over the whole file.

    Note that regions are 1-based, while start,end are python coordinates.
    '''
    ti_lazy_index_load( self.tabixfile )

    rtid = rstart = rend = 0

    # translate to a region string understood by the C tabix parser
    # (start is converted from 0-based python to 1-based samtools)
    if start != None and end != None:
        region = "%s:%i-%i" % (reference, start+1, end)
    elif start == None and end != None:
        region = "%s:%i-%i" % (reference, 1, end)
    elif end == None and start != None:
        region = "%s:%i-%i" % (reference, start+1, max_pos-1)

    # NOTE(review): lines between the branches above and the parse call
    # are not visible in this view (presumably the reference-only and
    # explicit-region cases).
    region = _force_bytes(region)
    ti_parse_region( self.tabixfile.idx, region,
                     &rtid, &rstart, &rend)
    if rtid < 0: raise ValueError( "invalid region `%s`" % region )
    if rstart > rend: raise ValueError( 'invalid region: start (%i) > end (%i)' % (rstart, rend) )
    if not 0 <= rstart < max_pos: raise ValueError( 'start out of range (%i)' % rstart )
    if not 0 <= rend < max_pos: raise ValueError( 'end out of range (%i)' % rend )

    return region, rtid, rstart, rend
# fetch — the `def fetch(...)` header and the docstring opening are not
# visible in this view.
fetch one or more rows in a :term:`region` using 0-based indexing. The region is specified by
:term:`reference`, *start* and *end*. Alternatively, a samtools :term:`region` string can be supplied.

Without *reference* or *region* all entries will be fetched.

If only *reference* is set, all reads matching on *reference* will be fetched.

If *parser* is None, the results are returned as an unparsed string.
Otherwise, *parser* is assumed to be a functor that will return parsed
data (see for example :meth:`asTuple` and :meth:`asGTF`).

ti_lazy_index_load( self.tabixfile )

if not self._isOpen():
    raise ValueError( "I/O operation on closed file" )

region, rtid, rstart, rend = self._parseRegion( reference, start, end, region )

# NOTE(review): the branch conditions selecting among the four returns
# below are not visible in this view; presumably they dispatch on
# whether a region was given and whether a parser was supplied.
return TabixIterator( self, rtid, rstart, rend )
return TabixIterator( self, -1, 0, 0 )
return TabixIteratorParsed( self, rtid, rstart, rend, parser )
return TabixIteratorParsed( self, -1, 0, 0, parser )
227 ###############################################################
228 ###############################################################
229 ###############################################################
231 ###############################################################
# property filename — the `property`/`def __get__` header lines are not
# visible in this view.
'''filename associated with this object.'''
if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
return self._filename
# property header — the header lines and part of the docstring are not
# visible in this view.
The header is returned as an iterator over lines without the
return TabixHeaderIterator( self )
# property contigs — the header lines are not visible in this view.
'''chromosome names'''
cdef char ** sequences
# NOTE(review): the declarations of `nsequences`, `x` and `result` are
# not visible in this view.
ti_lazy_index_load( self.tabixfile )
sequences = ti_seqname( self.tabixfile.idx, &nsequences )
# copy the C string array returned by tabix into the Python result list
for x from 0 <= x < nsequences:
    result.append( sequences[x] )
# close — the `def close` header and docstring opening are not visible
# in this view.
closes the :class:`pysam.Tabixfile`.'''
# Idempotent: only closes when a handle is open, then marks it NULL so
# __dealloc__ will not double-free.
if self.tabixfile != NULL:
    ti_close( self.tabixfile )
    self.tabixfile = NULL
def __dealloc__( self ):
    # remember: dealloc cannot call other python methods
    # note: no doc string
    # note: __del__ is not called.
    # Release the C tabix handle (if close() was never called) and the
    # strdup()'ed filename copy taken in _open().
    if self.tabixfile != NULL:
        ti_close( self.tabixfile )
        self.tabixfile = NULL
    if self._filename != NULL: free( self._filename )
cdef class TabixIterator:
    """iterates over rows in *tabixfile* in region
    given by *tid*, *start* and *end*.
    """

    def __cinit__(self, Tabixfile tabixfile,
                  int tid, int start, int end ):

        assert tabixfile._isOpen()

        # makes sure that samfile stays alive as long as the
        # iterator: keep a reference to the underlying C handle
        self.tabixfile = tabixfile.tabixfile

        # NOTE(review): branch lines are missing from this view — the
        # seek/ti_iter_first pair and the ti_queryi call presumably sit
        # on alternative branches (whole-file vs. region query).
        # seek to start of file to ensure iteration is over
        # the whole file
        bgzf_seek( self.tabixfile.fp, 0, 0)
        self.iterator = ti_iter_first()
        self.iterator = ti_queryi(self.tabixfile, tid, start, end)

        if <void*>self.iterator == NULL:
            raise ValueError("malformatted query or wrong sequence name.\n")

    # NOTE(review): the `def __next__` header and its read loop are not
    # visible in this view.
    """python version of next().

    pyrex uses this non-standard name instead of next()
    """
    # metachar filtering does not work within tabix
    # though it should. Getting the metachar is a pain
    # as ti_index_t is incomplete type.

    # simply use '#' for now.
    s = ti_read(self.tabixfile, self.iterator, &len)
    if s == NULL: raise StopIteration
    if s[0] != '#': break

    retval = _charptr_to_str( s )

    def __dealloc__(self):
        # free the C-level iterator; safe to call after StopIteration
        if <void*>self.iterator != NULL:
            ti_iter_destroy(self.iterator)
cdef class TabixHeaderIterator:
    """return header lines.
    """

    def __cinit__(self, Tabixfile tabixfile ):

        assert tabixfile._isOpen()

        # makes sure that samfile stays alive as long as the
        # iterator: keep a reference to the underlying C handle
        self.tabixfile = tabixfile.tabixfile

        # a NULL-region query starts reading from the top of the file
        self.iterator = ti_query(self.tabixfile, NULL, 0, 0)

        if <void*>self.iterator == NULL:
            raise ValueError("can't open header.\n")

    # NOTE(review): the `def __next__` header and loop lines are not
    # visible in this view.
    """python version of next().

    pyrex uses this non-standard name instead of next()
    """
    # Getting the metachar is a pain as ti_index_t is incomplete type.
    # simply use '#' for now.
    s = ti_read(self.tabixfile, self.iterator, &len)
    if s == NULL: raise StopIteration
    # stop at first non-header line
    if s[0] != '#': raise StopIteration

    def __dealloc__(self):
        # free the C-level iterator
        if <void*>self.iterator != NULL:
            ti_iter_destroy(self.iterator)
375 #########################################################
376 #########################################################
377 #########################################################
cdef class asTuple(Parser):
    '''converts a :term:`tabix row` into a python tuple.

    Access is by numeric index.
    '''
    def __call__(self, char * buffer, int len):
        cdef TabProxies.TupleProxy r
        r = TabProxies.TupleProxy()
        # need to copy - there were some
        # persistence issues with "present"
        r.copy( buffer, len )
        # NOTE(review): the `return r` line is not visible in this view.
cdef class asGTF(Parser):
    # NOTE(review): most docstring lines are not visible in this view.
    '''converts a :term:`tabix row` into a GTF record with the following

    genomic start coordinate (0-based)
    genomic end coordinate plus one (0-based)

    GTF formatted entries also defined the attributes:

    the transcript identifier
    '''
    def __call__(self, char * buffer, int len):
        cdef TabProxies.GTFProxy r
        r = TabProxies.GTFProxy()
        # copy the buffer so the proxy owns its memory; persistence
        # issues were seen with "present"
        r.copy( buffer, len )
        # NOTE(review): the `return r` line is not visible in this view.
cdef class asBed( Parser ):
    # NOTE(review): several docstring lines are not visible in this view.
    '''converts a :term:`tabix row` into a bed record
    with the following fields:

    genomic start coordinate (zero-based)
    genomic end coordinate plus one (zero-based)

    ',' separated string of block sizes
    ',' separated string of block genomic start positions

    Only the first three fields are required. Additional
    fields are optional, but if one is defined, all the preceding
    need to be defined as well.
    '''
    def __call__(self, char * buffer, int len):
        cdef TabProxies.BedProxy r
        r = TabProxies.BedProxy()
        # copy the buffer so the proxy owns its memory; persistence
        # issues were seen with "present"
        r.copy( buffer, len )
        # NOTE(review): the `return r` line is not visible in this view.
cdef class asVCF( Parser ):
    # NOTE(review): several docstring lines are not visible in this view.
    '''converts a :term:`tabix row` into a VCF record with
    the following fields:

    chromosomal position, zero-based

    Access to genotypes is via index::

       first_sample_genotype = vcf[0]
       second_sample_genotype = vcf[1]
    '''
    def __call__(self, char * buffer, int len ):
        cdef TabProxies.VCFProxy r
        r = TabProxies.VCFProxy()
        # copy the buffer so the proxy owns its memory; persistence
        # issues were seen with "present"
        r.copy( buffer, len )
        # NOTE(review): the `return r` line is not visible in this view.
507 #########################################################
508 #########################################################
509 #########################################################
cdef class TabixIteratorParsed:
    """iterates over mapped reads in a region.
    """

    # NOTE(review): the `def __cinit__(...)` header lines (presumably
    # taking tabixfile, tid, start, end, parser) are not visible here.
    assert tabixfile._isOpen()

    # makes sure that samfile stays alive as long as the
    # iterator: keep a reference to the underlying C handle
    self.tabixfile = tabixfile.tabixfile

    # NOTE(review): branch lines are missing — the seek/ti_iter_first
    # pair and the ti_queryi call presumably sit on alternative
    # branches (whole-file vs. region query).
    # seek to start of file to ensure iteration is over
    # the whole file
    bgzf_seek( self.tabixfile.fp, 0, 0)
    self.iterator = ti_iter_first()
    self.iterator = ti_queryi(self.tabixfile, tid, start, end)

    if <void*>self.iterator == NULL:
        raise ValueError("malformatted query or wrong sequence name.\n")

    # NOTE(review): the `def __next__` header and its read loop are not
    # visible in this view.
    """python version of next().

    pyrex uses this non-standard name instead of next()
    """
    s = ti_read(self.tabixfile, self.iterator, &len)
    if s == NULL: raise StopIteration
    # todo: read metachar from configuration
    if s[0] != '#': break
    # hand the raw C line to the user-supplied parser
    return self.parser(s, len)

    def __dealloc__(self):
        # free the C-level iterator
        if <void*>self.iterator != NULL:
            ti_iter_destroy(self.iterator)
def tabix_compress( filename_in,
                    # NOTE(review): the remaining signature lines
                    # (filename_out, force) and the docstring opening
                    # are not visible in this view.
    compress *filename_in* writing the output to *filename_out*.

    Raise an IOError if *filename_out* already exists, unless *force* is set.

    if not force and os.path.exists(filename_out ):
        raise IOError( "Filename '%s' already exists, use *force* to overwrite" % filename_out)

    O_RDONLY = os.O_RDONLY

    WINDOW_SIZE = 64 * 1024

    fn = _force_bytes(filename_out)
    fp = bgzf_open( fn, "w")
    # NOTE(review): BUG — the message below has a '%s' placeholder but
    # no '%' argument; it prints the literal '%s'. Should be
    # `% filename_out` (the guarding condition line is not visible).
    raise IOError( "could not open '%s' for writing" )

    fn = _force_bytes(filename_in)
    fd_src = open(fn, O_RDONLY)
    # NOTE(review): same missing-'%'-argument bug as above
    # (should be `% filename_in`).
    raise IOError( "could not open '%s' for reading" )

    buffer = malloc(WINDOW_SIZE)

    # copy WINDOW_SIZE-sized chunks from the source fd into the bgzf
    # output stream (loop header not visible in this view)
    c = read(fd_src, buffer, WINDOW_SIZE)
    r = bgzf_write(fp, buffer, c)
    raise OSError("writing failed")

    if r < 0: raise OSError("writing failed")
def tabix_index( filename,
                 # NOTE(review): the remaining signature lines
                 # (force, seq_col, start_col, end_col, preset,
                 # meta_char, zerobased, ...) are not visible here.
    index tab-separated *filename* using tabix.

    An existing index will not be overwritten unless
    *force* is set.

    The index will be built from coordinates
    in columns *seq_col*, *start_col* and *end_col*.

    The contents of *filename* have to be sorted by
    contig and position - the method does not check
    if the file is sorted.

    Column indices are 0-based. Coordinates in the file
    are assumed to be 1-based.

    If *preset* is provided, the column coordinates
    are taken from a preset. Valid values for preset
    are "gff", "bed", "sam", "vcf", "psltbl", "pileup".

    Lines beginning with *meta_char* and the first
    *line_skip* lines will be skipped.

    If *filename* does not end in ".gz", it will be automatically
    compressed. The original file will be removed and only the
    compressed file will be retained.

    If *filename* ends in *gz*, the file is assumed to be already
    compressed with bgzf.

    returns the filename of the compressed data

    if not os.path.exists(filename): raise IOError("No such file '%s'" % filename)

    # compress the input first if it is not bgzf-compressed yet; the
    # uncompressed original is removed
    if not filename.endswith(".gz"):

        tabix_compress( filename, filename + ".gz", force = force )
        os.unlink( filename )

    if not force and os.path.exists(filename + ".tbi" ):
        # NOTE(review): BUG — the message below has a '%s' placeholder
        # but no '%' argument; should be `% filename`.
        raise IOError( "Filename '%s.tbi' already exists, use *force* to overwrite" )

    # preset-code, contig, start, end, metachar for comments, lines to ignore at beginning
    # 0 is a missing column
    # NOTE(review): the `preset2conf = {` line is not visible here.
    'gff' : ( 0, 1, 4, 5, ord('#'), 0 ),
    'bed' : ( 0x10000, 1, 2, 3, ord('#'), 0 ),
    'psltbl' : ( 0x10000, 15, 17, 18, ord('#'), 0 ),
    'sam' : ( 1, 3, 4, 0, ord('#'), 0 ),
    'vcf' : ( 2, 1, 2, 0, ord('#'), 0 ),
    'pileup': (3, 1, 2, 0, ord('#'), 0 ),

    # NOTE(review): the try/except lines around the preset lookup are
    # not visible in this view.
    conf_data = preset2conf[preset]
    raise KeyError( "unknown preset '%s', valid presets are '%s'" % (preset, ",".join(preset2conf.keys() )))

    if end_col == None: end_col = -1

    # note that tabix internally works with 0-based coordinates and open/closed intervals.
    # When using a preset, conversion is automatically taken care of.
    # Otherwise, the coordinates are assumed to be 1-based closed intervals and
    # -1 is subtracted from the start coordinate. To avoid doing this, set
    # the TI_FLAG_UCSC=0x10000 flag:
    if zerobased: preset = preset | 0x10000

    conf_data = (preset, seq_col+1, start_col+1, end_col+1, ord(meta_char), 0)

    # NOTE(review): the `cdef ti_conf_t conf` declaration line is not
    # visible in this view.
    conf.preset, conf.sc, conf.bc, conf.ec, conf.meta_char, conf.line_skip = conf_data

    fn = _my_encodeFilename( filename )
    ti_index_build( fn, &conf)
701 #########################################################
702 #########################################################
703 #########################################################
704 ## Iterators for parsing through unindexed files.
705 #########################################################
ctypedef class tabix_inplace_iterator:
    '''iterate over ``infile``.

    This iterator is not safe. If the :meth:`__next__()` method is called
    after ``infile`` is closed, the result is undefined (see ``fclose()``).

    The iterator might either raise a StopIteration or segfault.
    '''

    def __cinit__(self, infile, int buffer_size = 65536 ):

        cdef int fd = PyObject_AsFileDescriptor( infile )
        if fd == -1: raise ValueError( "I/O operation on closed file." )
        self.infile = fdopen( fd, 'r')

        if self.infile == NULL: raise ValueError( "I/O operation on closed file." )

        # pre-allocate the line buffer; getline() may grow it later
        self.buffer = <char*>malloc( buffer_size )
        self.size = buffer_size

    cdef __cnext__(self):
        # NOTE(review): local declarations between this header and the
        # loop (e.g. `b`, `r`, `nbytes`) are not visible in this view.
        while not feof( self.infile ):
            nbytes = getline( &b, &self.size, self.infile)

            # stop at first error or eof
            if (nbytes == -1): break
            # skip comment lines
            if (b[0] == '#'): continue

            # skip empty lines
            if b[0] == '\0' or b[0] == '\n' or b[0] == '\r': continue

            # make sure that entry is complete
            if b[nbytes-1] != '\n' and b[nbytes-1] != '\r':
                raise ValueError( "incomplete line at %s" % result )

            # make sure that this goes fully through C
            # otherwise buffer is copied to/from a
            # Python object causing segfaults as
            # the wrong memory is freed
            r.present( b, nbytes )

    def __dealloc__(self):
        # NOTE(review): the dealloc body (presumably freeing
        # self.buffer) and the subsequent `def __next__` header are not
        # visible in this view; the return below belongs to __next__.
        return self.__cnext__()
ctypedef class tabix_copy_iterator:
    '''iterate over ``infile``.

    This iterator is not safe. If the :meth:`__next__()` method is called
    after ``infile`` is closed, the result is undefined (see ``fclose()``).

    The iterator might either raise a StopIteration or segfault.
    '''

    def __cinit__(self, infile, Parser parser ):

        cdef int fd = PyObject_AsFileDescriptor( infile )
        if fd == -1: raise ValueError( "I/O operation on closed file." )
        self.infile = fdopen( fd, 'r')
        if self.infile == NULL: raise ValueError( "I/O operation on closed file." )
        # NOTE(review): the line storing `parser` on self is not
        # visible in this view.

    cdef __cnext__(self):
        # NOTE(review): local declarations (e.g. `b`, `nbytes`) are not
        # visible in this view.
        while not feof( self.infile ):

            # getline allocates on demand
            # return number of characters read excluding null byte
            nbytes = getline( &b, &nbytes, self.infile)

            # stop at first error
            if (nbytes == -1): break
            # skip comment lines
            if (b[0] == '#'): continue

            # skip empty lines
            if b[0] == '\0' or b[0] == '\n' or b[0] == '\r': continue

            # make sure that entry is complete
            if b[nbytes-1] != '\n' and b[nbytes-1] != '\r':
                raise ValueError( "incomplete line at %s" % result )

            # make sure that this goes fully through C
            # otherwise buffer is copied to/from a
            # Python object causing segfaults as
            # the wrong memory is freed
            # -1 to remove the new-line character
            return self.parser(b, nbytes)

    # NOTE(review): the `def __next__` header is not visible in this
    # view; the return below belongs to it.
    return self.__cnext__()
class tabix_generic_iterator:
    '''iterate over ``infile``.

    Permits the use of file-like objects for example from the gzip module.
    '''
    def __init__(self, infile, parser ):
        # NOTE(review): the lines storing `infile` and `parser` on self
        # are not visible in this view.
        if self.infile.closed: raise ValueError( "I/O operation on closed file." )

    # cython version - required for python 3
    # NOTE(review): the `def __next__` header, loop construct and the
    # declarations of `b`/`nbytes` are not visible in this view.
    line = self.infile.readline()
    s = _force_bytes( line )
    assert b[nbytes] == '\0'

    # skip comment lines
    if (b[0] == '#'): continue

    # skip empty lines
    if b[0] == '\0' or b[0] == '\n' or b[0] == '\r': continue

    # make sure that entry is complete
    if b[nbytes-1] != '\n' and b[nbytes-1] != '\r':
        raise ValueError( "incomplete line at %s" % line )

    # take a C-level copy of the line so the parser owns its memory
    cpy = <char*>malloc(nbytes+1)
    if cpy == NULL: raise MemoryError()
    memcpy( cpy, b, nbytes+1)

    return self.parser(cpy, nbytes)

    # python version - required for python 2.7
    # NOTE(review): the `def next` header is not visible in this view.
    return self.__next__()
def tabix_iterator( infile, parser ):
    """return an iterator over all entries in a file."""
    # Always use the generic (pure file-object) iterator; the
    # commented alternatives below dispatched on the file type.
    return tabix_generic_iterator( infile, parser )
    # file objects can use C stdio
    # used to be: isinstance( infile, file):

    # if isinstance( infile, io.IOBase ):
    #     return tabix_copy_iterator( infile, parser )

    #     return tabix_generic_iterator( infile, parser )

    # if isinstance( infile, file ):
    #     return tabix_copy_iterator( infile, parser )

    #     return tabix_generic_iterator( infile, parser )
896 __all__ = ["tabix_index",
904 "tabix_inplace_iterator"