X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=pysam.git;a=blobdiff_plain;f=pysam%2Fctabix.pyx;fp=pysam%2Fctabix.pyx;h=a522676c6a429b212a77b82b82febb4d2d3d5b98;hp=a31f3f244b37329286cc23760d5c2942b8628103;hb=68c074fc81858150ca7447a88e731eec96be6378;hpb=768881ffd9d33e3c5fa00dd9ea6f488f4f0700b3 diff --git a/pysam/ctabix.pyx b/pysam/ctabix.pyx index a31f3f2..a522676 100644 --- a/pysam/ctabix.pyx +++ b/pysam/ctabix.pyx @@ -1,27 +1,102 @@ # cython: embedsignature=True # adds doc-strings for sphinx +# Helper functions for python 3 compatibility - taken +# from csamtools.pyx import tempfile, os, sys, types, itertools, struct, ctypes, gzip -from cpython cimport PyString_FromStringAndSize, PyString_AS_STRING +import io cimport TabProxies +from cpython cimport PyErr_SetString, PyBytes_Check, \ + PyUnicode_Check, PyBytes_FromStringAndSize, \ + PyObject_AsFileDescriptor + +PYTHON3 = PY_MAJOR_VERSION >= 3 + +# from cpython cimport PyString_FromStringAndSize, PyString_AS_STRING +from cpython.version cimport PY_MAJOR_VERSION + +cdef from_string_and_size(char* s, size_t length): + if PY_MAJOR_VERSION < 3: + return s[:length] + else: + return s[:length].decode("ascii") + +# filename encoding (copied from lxml.etree.pyx) +cdef str _FILENAME_ENCODING +_FILENAME_ENCODING = sys.getfilesystemencoding() +if _FILENAME_ENCODING is None: + _FILENAME_ENCODING = sys.getdefaultencoding() +if _FILENAME_ENCODING is None: + _FILENAME_ENCODING = 'ascii' + +#cdef char* _C_FILENAME_ENCODING +#_C_FILENAME_ENCODING = _FILENAME_ENCODING + +cdef bytes _my_encodeFilename(object filename): + u"""Make sure a filename is 8-bit encoded (or None). + """ + if filename is None: + return None + elif PyBytes_Check(filename): + return filename + elif PyUnicode_Check(filename): + return filename.encode(_FILENAME_ENCODING) + else: + raise TypeError, u"Argument must be string or unicode." + +cdef bytes _force_bytes(object s): + u"""convert string or unicode object to bytes, assuming ascii encoding. + """ + if PY_MAJOR_VERSION < 3: + return s + elif s is None: + return None + elif PyBytes_Check(s): + return s + elif PyUnicode_Check(s): + return s.encode('ascii') + else: + raise TypeError, u"Argument must be string, bytes or unicode." + +cdef inline bytes _force_cmdline_bytes(object s): + return _force_bytes(s) + +cdef _charptr_to_str(char* s): + if PY_MAJOR_VERSION < 3: + return s + else: + return s.decode("ascii") + +cdef _force_str(object s): + """Return s converted to str type of current Python (bytes in Py2, unicode in Py3)""" + if s is None: + return None + if PY_MAJOR_VERSION < 3: + return s + elif PyBytes_Check(s): + return s.decode('ascii') + else: + # assume unicode + return s + + cdef class Tabixfile: '''*(filename, mode='r')* opens a :term:`tabix file` for reading. A missing index (*filename* + ".tbi") will raise an exception. ''' - - def __cinit__(self, *args, **kwargs ): + def __cinit__(self, filename, mode = 'r', *args, **kwargs ): self.tabixfile = NULL - self._open( *args, **kwargs ) + self._open( filename, mode, *args, **kwargs ) def _isOpen( self ): '''return true if samfile has been opened.''' return self.tabixfile != NULL def _open( self, - char * filename, + filename, mode ='r', ): '''open a :term:`tabix file` for reading. @@ -33,25 +108,34 @@ cdef class Tabixfile: if self.tabixfile != NULL: self.close() self.tabixfile = NULL + filename_index = filename + ".tbi" + self.isremote = filename.startswith( "http:") or filename.startswith( "ftp:" ) + + # encode all the strings + filename = _my_encodeFilename(filename) + filename_index = _my_encodeFilename(filename_index) + cdef bytes bmode = mode.encode('ascii') + if self._filename != NULL: free(self._filename ) - self._filename = strdup( filename ) - filename_index = filename + ".tbi" + self._filename = strdup(filename) if mode[0] == 'w': # open file for writing - pass + raise NotImplementedError("writing to tabix files not implemented" ) elif mode[0] == "r": # open file for reading - if not os.path.exists( self._filename ): - raise IOError( "file `%s` not found" % self._filename) + + if not self.isremote: + if not os.path.exists( filename ): + raise IOError( "file `%s` not found" % filename) - if not os.path.exists( filename_index ): - raise IOError( "index `%s` not found" % filename_index) + if not os.path.exists( filename_index ): + raise IOError( "index `%s` not found" % filename_index) # open file and load index - self.tabixfile = ti_open( self._filename, filename_index ) + self.tabixfile = ti_open( filename, filename_index ) if self.tabixfile == NULL: raise IOError("could not open file `%s`" % filename ) @@ -93,7 +177,9 @@ cdef class Tabixfile: region = reference if region: - ti_parse_region( self.tabixfile.idx, region, &rtid, &rstart, &rend) + region = _force_bytes(region) + ti_parse_region( self.tabixfile.idx, region, + &rtid, &rstart, &rend) if rtid < 0: raise ValueError( "invalid region `%s`" % region ) if rstart > rend: raise ValueError( 'invalid region: start (%i) > end (%i)' % (rstart, rend) ) if not 0 <= rstart < max_pos: raise ValueError( 'start out of range (%i)' % rstart ) @@ -195,9 +281,6 @@ cdef class TabixIterator: given by *tid*, *start* and *end*. """ - cdef ti_iter_t iterator - cdef tabix_t * tabixfile - def __cinit__(self, Tabixfile tabixfile, int tid, int start, int end ): @@ -239,7 +322,8 @@ cdef class TabixIterator: if s == NULL: raise StopIteration if s[0] != '#': break - return s + retval = _charptr_to_str( s ) + return retval def __dealloc__(self): if self.iterator != NULL: @@ -249,9 +333,6 @@ cdef class TabixHeaderIterator: """return header lines. """ - cdef ti_iter_t iterator - cdef tabix_t * tabixfile - def __cinit__(self, Tabixfile tabixfile ): assert tabixfile._isOpen() @@ -290,6 +371,7 @@ cdef class TabixHeaderIterator: if self.iterator != NULL: ti_iter_destroy(self.iterator) + ######################################################### ######################################################### ######################################################### @@ -431,10 +513,6 @@ cdef class TabixIteratorParsed: Returns parsed data. """ - cdef ti_iter_t iterator - cdef tabix_t * tabixfile - cdef Parser parser - def __cinit__(self, Tabixfile tabixfile, int tid, @@ -484,9 +562,8 @@ cdef class TabixIteratorParsed: ti_iter_destroy(self.iterator) def tabix_compress( filename_in, - filename_out, - force = False ): - + filename_out, + force = False ): ''' compress *filename_in* writing the output to *filename_out*. @@ -507,15 +584,18 @@ def tabix_compress( filename_in, WINDOW_SIZE = 64 * 1024 - fp = bgzf_open( filename_out, "w") + fn = _force_bytes(filename_out) + fp = bgzf_open( fn, "w") if fp == NULL: raise IOError( "could not open '%s' for writing" ) - fd_src = open(filename_in, O_RDONLY) + fn = _force_bytes(filename_in) + fd_src = open(fn, O_RDONLY) if fd_src == 0: raise IOError( "could not open '%s' for reading" ) buffer = malloc(WINDOW_SIZE) + c = 1 while c > 0: c = read(fd_src, buffer, WINDOW_SIZE) @@ -573,7 +653,6 @@ def tabix_index( filename, if not os.path.exists(filename): raise IOError("No such file '%s'" % filename) if not filename.endswith(".gz"): - tabix_compress( filename, filename + ".gz", force = force ) os.unlink( filename ) filename += ".gz" @@ -614,9 +693,205 @@ def tabix_index( filename, cdef ti_conf_t conf conf.preset, conf.sc, conf.bc, conf.ec, conf.meta_char, conf.line_skip = conf_data - ti_index_build( filename, &conf) + fn = _my_encodeFilename( filename ) + ti_index_build( fn, &conf) return filename + +######################################################### +######################################################### +######################################################### +## Iterators for parsing through unindexed files. +######################################################### +ctypedef class tabix_inplace_iterator: + '''iterate over ``infile``. + + This iterator is not safe. If the :meth:`__next__()` method is called + after ``infile`` is closed, the result is undefined (see ``fclose()``). + + The iterator might either raise a StopIteration or segfault. + ''' + + + def __cinit__(self, infile, int buffer_size = 65536 ): + + cdef int fd = PyObject_AsFileDescriptor( infile ) + if fd == -1: raise ValueError( "I/O operation on closed file." ) + self.infile = fdopen( fd, 'r') + + if self.infile == NULL: raise ValueError( "I/O operation on closed file." ) + + self.buffer = malloc( buffer_size ) + self.size = buffer_size + + def __iter__(self): + return self + + cdef __cnext__(self): + + cdef char * b + cdef size_t nbytes + b = self.buffer + r = self.Parser() + + while not feof( self.infile ): + nbytes = getline( &b, &self.size, self.infile) + + # stop at first error or eof + if (nbytes == -1): break + # skip comments + if (b[0] == '#'): continue + + # skip empty lines + if b[0] == '\0' or b[0] == '\n' or b[0] == '\r': continue + + # make sure that entry is complete + if b[nbytes-1] != '\n' and b[nbytes-1] != '\r': + result = b + raise ValueError( "incomplete line at %s" % result ) + + # make sure that this goes fully through C + # otherwise buffer is copied to/from a + # Python object causing segfaults as + # the wrong memory is freed + r.present( b, nbytes ) + return r + + raise StopIteration + + def __dealloc__(self): + free(self.buffer) + + def __next__(self): + return self.__cnext__() + +ctypedef class tabix_copy_iterator: + '''iterate over ``infile``. + + This iterator is not save. If the :meth:`__next__()` method is called + after ``infile`` is closed, the result is undefined (see ``fclose()``). + + The iterator might either raise a StopIteration or segfault. + ''' + + def __cinit__(self, infile, Parser parser ): + + cdef int fd = PyObject_AsFileDescriptor( infile ) + if fd == -1: raise ValueError( "I/O operation on closed file." ) + self.infile = fdopen( fd, 'r') + if self.infile == NULL: raise ValueError( "I/O operation on closed file." ) + self.parser = parser + + def __iter__(self): + return self + + cdef __cnext__(self): + + cdef char * b + cdef size_t nbytes + cdef int x + + b = NULL + + while not feof( self.infile ): + + # getline allocates on demand + # return number of characters read excluding null byte + nbytes = getline( &b, &nbytes, self.infile) + # stop at first error + if (nbytes == -1): break + # skip comments + if (b[0] == '#'): continue + + # skip empty lines + if b[0] == '\0' or b[0] == '\n' or b[0] == '\r': continue + + # make sure that entry is complete + if b[nbytes-1] != '\n' and b[nbytes-1] != '\r': + result = b + free(b) + raise ValueError( "incomplete line at %s" % result ) + + # make sure that this goes fully through C + # otherwise buffer is copied to/from a + # Python object causing segfaults as + # the wrong memory is freed + # -1 to remove the new-line character + return self.parser(b, nbytes) + + free(b) + raise StopIteration + + def __next__(self): + return self.__cnext__() + +class tabix_generic_iterator: + '''iterate over ``infile``. + + Permits the use of file-like objects for example from the gzip module. + ''' + def __init__(self, infile, parser ): + + self.infile = infile + if self.infile.closed: raise ValueError( "I/O operation on closed file." ) + self.parser = parser + + def __iter__(self): + return self + + # cython version - required for python 3 + def __next__(self): + + cdef char * b, * cpy + cdef size_t nbytes + while 1: + + line = self.infile.readline() + if not line: break + + s = _force_bytes( line ) + b = s + nbytes = len( line ) + assert b[nbytes] == '\0' + + # skip comments + if (b[0] == '#'): continue + + # skip empty lines + if b[0] == '\0' or b[0] == '\n' or b[0] == '\r': continue + + # make sure that entry is complete + if b[nbytes-1] != '\n' and b[nbytes-1] != '\r': + raise ValueError( "incomplete line at %s" % line ) + + # create a copy + cpy = malloc(nbytes+1) + if cpy == NULL: raise MemoryError() + memcpy( cpy, b, nbytes+1) + + return self.parser(cpy, nbytes) + + raise StopIteration + + # python version - required for python 2.7 + def next(self): + return self.__next__() + +def tabix_iterator( infile, parser ): + """return an iterator over all entries in a file.""" + return tabix_generic_iterator( infile, parser ) + # file objects can use C stdio + # used to be: isinstance( infile, file): + # if PYTHON3: + # if isinstance( infile, io.IOBase ): + # return tabix_copy_iterator( infile, parser ) + # else: + # return tabix_generic_iterator( infile, parser ) + # else: +# if isinstance( infile, file ): +# return tabix_copy_iterator( infile, parser ) +# else: +# return tabix_generic_iterator( infile, parser ) __all__ = ["tabix_index", "tabix_compress", @@ -625,4 +900,6 @@ __all__ = ["tabix_index", "asGTF", "asVCF", "asBed", + "tabix_iterator", + "tabix_inplace_iterator" ]