# cython: embedsignature=True
# adds doc-strings for sphinx
-import tempfile, os, sys, types, itertools, struct, ctypes
+# Helper functions for python 3 compatibility - taken
+# from csamtools.pyx
+import tempfile, os, sys, types, itertools, struct, ctypes, gzip
+import io
+cimport TabProxies
+
+from cpython cimport PyErr_SetString, PyBytes_Check, \
+ PyUnicode_Check, PyBytes_FromStringAndSize, \
+ PyObject_AsFileDescriptor
+
+PYTHON3 = PY_MAJOR_VERSION >= 3
+
+# from cpython cimport PyString_FromStringAndSize, PyString_AS_STRING
+from cpython.version cimport PY_MAJOR_VERSION
+
+cdef from_string_and_size(char* s, size_t length):
+    # Return the first `length` bytes of C string `s` as the native
+    # string type: bytes on Python 2, an ascii-decoded str on Python 3.
+    if PY_MAJOR_VERSION < 3:
+        return s[:length]
+    else:
+        return s[:length].decode("ascii")
+
+# filename encoding (copied from lxml.etree.pyx)
+cdef str _FILENAME_ENCODING
+_FILENAME_ENCODING = sys.getfilesystemencoding()
+if _FILENAME_ENCODING is None:
+ _FILENAME_ENCODING = sys.getdefaultencoding()
+if _FILENAME_ENCODING is None:
+ _FILENAME_ENCODING = 'ascii'
+
+#cdef char* _C_FILENAME_ENCODING
+#_C_FILENAME_ENCODING = <char*>_FILENAME_ENCODING
+
+cdef bytes _my_encodeFilename(object filename):
+ u"""Make sure a filename is 8-bit encoded (or None).
+ """
+ if filename is None:
+ return None
+ elif PyBytes_Check(filename):
+ return filename
+ elif PyUnicode_Check(filename):
+ return filename.encode(_FILENAME_ENCODING)
+ else:
+ raise TypeError, u"Argument must be string or unicode."
+
+cdef bytes _force_bytes(object s):
+ u"""convert string or unicode object to bytes, assuming ascii encoding.
+ """
+ if PY_MAJOR_VERSION < 3:
+ return s
+ elif s is None:
+ return None
+ elif PyBytes_Check(s):
+ return s
+ elif PyUnicode_Check(s):
+ return s.encode('ascii')
+ else:
+ raise TypeError, u"Argument must be string, bytes or unicode."
+
+cdef inline bytes _force_cmdline_bytes(object s):
+    # Alias for _force_bytes; kept as a separate entry point in case
+    # command-line arguments ever need different encoding handling.
+    return _force_bytes(s)
+
+cdef _charptr_to_str(char* s):
+    # Convert a C string to the native str type: bytes on Python 2,
+    # an ascii-decoded unicode string on Python 3.
+    if PY_MAJOR_VERSION < 3:
+        return s
+    else:
+        return s.decode("ascii")
+
+cdef _force_str(object s):
+ """Return s converted to str type of current Python (bytes in Py2, unicode in Py3)"""
+ if s is None:
+ return None
+ if PY_MAJOR_VERSION < 3:
+ return s
+ elif PyBytes_Check(s):
+ return s.decode('ascii')
+ else:
+ # assume unicode
+ return s
+
cdef class Tabixfile:
'''*(filename, mode='r')*
opens a :term:`tabix file` for reading. A missing
index (*filename* + ".tbi") will raise an exception.
'''
-
- cdef char * filename
-
- # pointer to tabixfile
- cdef tabix_t * tabixfile
-
- def __cinit__(self, *args, **kwargs ):
+ def __cinit__(self, filename, mode = 'r', *args, **kwargs ):
self.tabixfile = NULL
- self._open( *args, **kwargs )
+ self._open( filename, mode, *args, **kwargs )
def _isOpen( self ):
'''return true if samfile has been opened.'''
return self.tabixfile != NULL
def _open( self,
- char * filename,
+ filename,
mode ='r',
):
'''open a :term:`tabix file` for reading.
if self.tabixfile != NULL: self.close()
self.tabixfile = NULL
- self.filename = filename
filename_index = filename + ".tbi"
+ self.isremote = filename.startswith( "http:") or filename.startswith( "ftp:" )
+
+ # encode all the strings
+ filename = _my_encodeFilename(filename)
+ filename_index = _my_encodeFilename(filename_index)
+ cdef bytes bmode = mode.encode('ascii')
+
+ if self._filename != NULL: free(self._filename )
+
+ self._filename = strdup(filename)
if mode[0] == 'w':
# open file for writing
- pass
+ raise NotImplementedError("writing to tabix files not implemented" )
elif mode[0] == "r":
# open file for reading
- if not os.path.exists( self.filename ):
- raise IOError( "file `%s` not found" % self.filename)
+
+ if not self.isremote:
+ if not os.path.exists( filename ):
+ raise IOError( "file `%s` not found" % filename)
- if not os.path.exists( filename_index ):
- raise IOError( "index `%s` not found" % filename_index)
+ if not os.path.exists( filename_index ):
+ raise IOError( "index `%s` not found" % filename_index)
# open file and load index
- self.tabixfile = ti_open( self.filename, filename_index )
+ self.tabixfile = ti_open( filename, filename_index )
if self.tabixfile == NULL:
raise IOError("could not open file `%s`" % filename )
region = reference
if region:
- ti_parse_region( self.tabixfile.idx, region, &rtid, &rstart, &rend)
+ region = _force_bytes(region)
+ ti_parse_region( self.tabixfile.idx, region,
+ &rtid, &rstart, &rend)
if rtid < 0: raise ValueError( "invalid region `%s`" % region )
if rstart > rend: raise ValueError( 'invalid region: start (%i) > end (%i)' % (rstart, rend) )
if not 0 <= rstart < max_pos: raise ValueError( 'start out of range (%i)' % rstart )
else:
return TabixIteratorParsed( self, -1, 0, 0, parser )
+ ###############################################################
+ ###############################################################
+ ###############################################################
+ ## properties
+ ###############################################################
+ property filename:
+ '''filename associated with this object.'''
+ def __get__(self):
+ if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
+ return self._filename
+
+ property header:
+ '''the file header.
+
+ .. note::
+ The header is returned as an iterator over lines without the
+ newline character.
+ '''
+
+ def __get__( self ):
+ return TabixHeaderIterator( self )
+
property contigs:
- '''chromosome names'''
- def __get__(self):
- cdef char ** sequences
- cdef int nsequences
+ '''chromosome names'''
+ def __get__(self):
+ cdef char ** sequences
+ cdef int nsequences
- ti_lazy_index_load( self.tabixfile )
- sequences = ti_seqname( self.tabixfile.idx, &nsequences )
- cdef int x
- result = []
- for x from 0 <= x < nsequences:
- result.append( sequences[x] )
- return result
+ ti_lazy_index_load( self.tabixfile )
+ sequences = ti_seqname( self.tabixfile.idx, &nsequences )
+ cdef int x
+ result = []
+ for x from 0 <= x < nsequences:
+ result.append( sequences[x] )
+ return result
+ def close( self ):
+ '''
+ closes the :class:`pysam.Tabixfile`.'''
+ if self.tabixfile != NULL:
+ ti_close( self.tabixfile )
+ self.tabixfile = NULL
+
+ def __dealloc__( self ):
+ # remember: dealloc cannot call other python methods
+ # note: no doc string
+ # note: __del__ is not called.
+ if self.tabixfile != NULL:
+ ti_close( self.tabixfile )
+ self.tabixfile = NULL
+ if self._filename != NULL: free( self._filename )
+
cdef class TabixIterator:
"""iterates over rows in *tabixfile* in region
given by *tid*, *start* and *end*.
"""
- cdef ti_iter_t iterator
- cdef tabix_t * tabixfile
-
def __cinit__(self, Tabixfile tabixfile,
int tid, int start, int end ):
cdef char * s
cdef int len
- s = ti_read(self.tabixfile, self.iterator, &len)
- if s == NULL: raise StopIteration
- return s
+ # metachar filtering does not work within tabix
+ # though it should. Getting the metachar is a pain
+ # as ti_index_t is incomplete type.
+
+ # simply use '#' for now.
+ while 1:
+ s = ti_read(self.tabixfile, self.iterator, &len)
+ if s == NULL: raise StopIteration
+ if s[0] != '#': break
+
+ retval = _charptr_to_str( s )
+ return retval
def __dealloc__(self):
if <void*>self.iterator != NULL:
ti_iter_destroy(self.iterator)
-def toDot( v ):
- '''convert value to '.' if None'''
- if v == None: return "."
- else: return str(v)
-
-def quote( v ):
- '''return a quoted attribute.'''
- if type(v) in types.StringTypes:
- return '"%s"' % v
- else:
- return str(v)
-
-cdef class TupleProxy:
- '''Proxy class for access to parsed row as a tuple.
-
- This class represents a table row for fast read-access.
- '''
-
- cdef:
- char * data
- char ** fields
- int nfields
- int index
-
- def __cinit__(self ):
-
- self.data = NULL
- self.fields = NULL
- self.index = 0
-
- cdef take( self, char * buffer, size_t nbytes ):
- '''start presenting buffer.
-
- Take ownership of the pointer.
- '''
- self.data = buffer
- self.update( buffer, nbytes )
-
- cdef present( self, char * buffer, size_t nbytes ):
- '''start presenting buffer.
-
- Do not take ownership of the pointer.
- '''
- self.update( buffer, nbytes )
-
- cdef copy( self, char * buffer, size_t nbytes ):
- '''start presenting buffer.
-
- Take a copy of buffer.
- '''
- cdef int s
- # +1 for '\0'
- s = sizeof(char) * (nbytes + 1)
- self.data = <char*>malloc( s )
- memcpy( <char*>self.data, buffer, s )
- self.update( self.data, nbytes )
-
- cdef update( self, char * buffer, size_t nbytes ):
- '''update internal data.'''
- cdef char * pos
- cdef char * old_pos
- cdef int field
- cdef int max_fields
- field = 0
-
- if buffer[nbytes] != 0:
- raise ValueError( "incomplete line at %s" % buffer )
-
- if self.fields != NULL:
- free(self.fields)
-
- max_fields = nbytes / 4
- self.fields = <char **>calloc( max_fields, sizeof(char *) )
+cdef class TabixHeaderIterator:
+ """return header lines.
+ """
+
+ def __cinit__(self, Tabixfile tabixfile ):
- pos = buffer
- self.fields[0] = pos
- field += 1
- old_pos = pos
+ assert tabixfile._isOpen()
- while 1:
-
- pos = <char*>memchr( pos, '\t', nbytes )
- if pos == NULL: break
- pos[0] = '\0'
- pos += 1
- self.fields[field] = pos
- field += 1
- if field >= max_fields:
- raise ValueError("row too large - more than %i fields" % max_fields )
- nbytes -= pos - old_pos
- if nbytes < 0: break
- old_pos = pos
-
- self.nfields = field
-
- def __getitem__( self, key ):
-
- cdef int i
- i = key
- if i < 0: i += self.nfields
- if i >= self.nfields or i < 0:
- raise IndexError( "list index out of range" )
- return self.fields[i]
+ # makes sure that samfile stays alive as long as the
+ # iterator is alive.
+ self.tabixfile = tabixfile.tabixfile
- def __len__(self):
- return self.nfields
+ self.iterator = ti_query(self.tabixfile, NULL, 0, 0)
- def __dealloc__(self):
- if self.data != NULL:
- free(self.data)
+ if <void*>self.iterator == NULL:
+ raise ValueError("can't open header.\n")
def __iter__(self):
- self.index = 0
- return self
+ return self
def __next__(self):
"""python version of next().
- """
- if self.index >= self.nfields:
- raise StopIteration
- self.index += 1
- return self.fields[self.index-1]
-cdef class GTFProxy:
- '''Proxy class for access to GTF fields.
-
- This class represents a GTF entry for fast read-access.
- Write-access has been added as well, though some care must
- be taken. If any of the string fields (contig, source, ...)
- are set, the new value is tied to the lifetime of the
- argument that was supplied.
-
- The only exception is the attributes field when set from
- a dictionary - this field will manage its own memory.
-
- '''
-
- cdef:
- char * contig
- char * source
- char * feature
- uint32_t start
- uint32_t end
- char * score
- char * strand
- char * frame
- char * attributes
- int nbytes
- char * data
- cdef bint isModified
- cdef bint hasOwnAttributes
-
- def __cinit__(self ):
- self.data = NULL
- self.isModified = False
- self.hasOwnAttributes = False
-
- cdef take( self, char * buffer, size_t nbytes ):
- '''start presenting buffer.
-
- Take ownership of the pointer.
- '''
- self.data = buffer
- self.update( buffer, nbytes )
- self.isModified = False
-
- cdef present( self, char * buffer, size_t nbytes ):
- '''start presenting buffer.
-
- Do not take ownership of the pointer.
- '''
- self.update( buffer, nbytes )
- self.isModified = False
-
- cdef copy( self, char * buffer, size_t nbytes ):
- '''start presenting buffer.
-
- Take a copy of buffer.
- '''
- cdef int s
- # +1 for '\0'
- s = sizeof(char) * (nbytes + 1)
- self.data = <char*>malloc( s )
- memcpy( <char*>self.data, buffer, s )
- self.update( self.data, nbytes )
- self.isModified = False
-
- cdef update( self, char * buffer, size_t nbytes ):
- '''update internal data.
-
- nbytes does not include the terminal '\0'.
- '''
- cdef int end
- cdef char * cstart, * cend, * cscore
- self.contig = buffer
- self.nbytes = nbytes
- cdef char * pos
-
- if buffer[nbytes] != 0:
- raise ValueError( "incomplete line at %s" % buffer )
-
- pos = strchr( buffer, '\t' )
- if pos == NULL: raise ValueError( "malformatted entry at %s" % buffer )
- pos[0] = '\0'
- pos += 1
- self.source = pos
-
- pos = strchr( pos, '\t' )
- if pos == NULL: raise ValueError( "malformatted entry at %s" % buffer )
- pos[0] = '\0'
- pos += 1
- self.feature = pos
-
- pos = strchr( pos, '\t' )
- if pos == NULL: raise ValueError( "malformatted entry at %s" % buffer )
- pos[0] = '\0'
- pos += 1
- cstart = pos
-
- pos = strchr( pos, '\t' )
- if pos == NULL: raise ValueError( "malformatted entry at %s" % buffer )
- pos[0] = '\0'
- pos += 1
- cend = pos
-
- pos = strchr( pos, '\t' )
- if pos == NULL: raise ValueError( "malformatted entry at %s" % buffer )
- pos[0] = '\0'
- pos += 1
- self.score = pos
-
- pos = strchr( pos, '\t' )
- if pos == NULL: raise ValueError( "malformatted entry at %s" % buffer )
- pos[0] = '\0'
- pos += 1
- self.strand = pos
-
- pos = strchr( pos, '\t' )
- if pos == NULL: raise ValueError( "malformatted entry at %s" % buffer )
- pos[0] = '\0'
- pos += 1
- self.frame = pos
-
- pos = strchr( pos, '\t' )
- if pos == NULL: raise ValueError( "malformatted entry at %s" % buffer )
- pos[0] = '\0'
- pos += 1
- self.attributes = pos
- self.start = atoi( cstart ) - 1
- self.end = atoi( cend )
-
- property contig:
- '''contig of feature.'''
- def __get__( self ): return self.contig
- def __set__( self, value ):
- self.isModified = True
- self.contig = value
-
- property feature:
- '''feature name.'''
- def __get__( self ): return self.feature
- def __set__( self, value ):
- self.isModified = True
- self.feature = value
-
- property source:
- '''feature source.'''
- def __get__( self ): return self.source
- def __set__( self, value ):
- self.isModified = True
- self.source = value
-
- property start:
- '''feature start (in 0-based open/closed coordinates).'''
- def __get__( self ): return self.start
- def __set__( self, value ):
- self.isModified = True
- self.start = value
-
- property end:
- '''feature end (in 0-based open/closed coordinates).'''
- def __get__( self ): return self.end
- def __set__( self, value ):
- self.isModified = True
- self.end = value
-
- property score:
- '''feature score.'''
- def __get__( self ):
- if self.score[0] == '.' and self.score[1] == '\0' :
- return None
- else:
- return atof(self.score)
- def __set__( self, value ):
- self.isModified = True
- self.score = value
-
- property strand:
- '''feature strand.'''
- def __get__( self ): return self.strand
- def __set__( self, value ):
- self.isModified = True
- self.strand = value
-
- property frame:
- '''feature frame.'''
- def __get__( self ): return self.frame
- def __set__( self, value ):
- self.isModified = True
- self.frame = value
-
- property attributes:
- '''feature attributes (as a string).'''
- def __get__( self ): return self.attributes
- def __set__( self, value ):
- self.isModified = True
- self.attributes = value
-
- def asDict( self ):
- """parse attributes - return as dict
+ pyrex uses this non-standard name instead of next()
"""
-
- # remove comments
- attributes = self.attributes
-
- # separate into fields
- fields = [ x.strip() for x in attributes.split(";")[:-1]]
-
- result = {}
-
- for f in fields:
-
- d = [ x.strip() for x in f.split(" ")]
-
- n,v = d[0], d[1]
- if len(d) > 2: v = d[1:]
-
- if v[0] == '"' and v[-1] == '"':
- v = v[1:-1]
- else:
- ## try to convert to a value
- try:
- v = float( v )
- v = int( v )
- except ValueError:
- pass
- except TypeError:
- pass
-
- result[n] = v
-
- return result
- def fromDict( self, d ):
- '''set attributes from a dictionary.'''
- cdef char * p
- cdef int l
-
- # clean up if this field is set twice
- if self.hasOwnAttributes:
- free(self.attributes)
-
- aa = []
- for k,v in d.items():
- if type(v) == types.StringType:
- aa.append( '%s "%s"' % (k,v) )
- else:
- aa.append( '%s %s' % (k,str(v)) )
-
- a = "; ".join( aa ) + ";"
- p = a
- l = len(a)
- self.attributes = <char *>calloc( l + 1, sizeof(char) )
- memcpy( self.attributes, p, l )
-
- self.hasOwnAttributes = True
- self.isModified = True
-
- def __str__(self):
- cdef char * cpy
- cdef int x
-
- if self.isModified:
- return "\t".join(
- (self.contig,
- self.source,
- self.feature,
- str(self.start+1),
- str(self.end),
- toDot(self.score),
- self.strand,
- self.frame,
- self.attributes ) )
- else:
- cpy = <char*>calloc( sizeof(char), self.nbytes+1 )
- memcpy( cpy, self.data, self.nbytes+1)
- for x from 0 <= x < self.nbytes:
- if cpy[x] == '\0': cpy[x] = '\t'
- result = cpy
- free(cpy)
- return result
-
- def invert( self, int lcontig ):
- '''invert coordinates to negative strand coordinates
-
- This method will only act if the feature is on the
- negative strand.'''
-
- if self.strand[0] == '-':
- start = min(self.start, self.end)
- end = max(self.start, self.end)
- self.start, self.end = lcontig - end, lcontig - start
+ cdef char * s
+ cdef int len
- def keys( self ):
- '''return a list of attributes defined in this entry.'''
- r = self.attributes
- return [ x.strip().split(" ")[0] for x in r.split(";") if x.strip() != '' ]
+ # Getting the metachar is a pain as ti_index_t is incomplete type.
+ # simply use '#' for now.
+ s = ti_read(self.tabixfile, self.iterator, &len)
+ if s == NULL: raise StopIteration
+ # stop at first non-header line
+ if s[0] != '#': raise StopIteration
- def __getitem__(self, item):
- return self.__getattr__( item )
+ return s
def __dealloc__(self):
- if self.data != NULL:
- free(self.data)
- if self.hasOwnAttributes:
- free(self.attributes)
-
- def __getattr__(self, item ):
- """Generic lookup of attribute from GFF/GTF attributes
- Only called if there *isn't* an attribute with this name
- """
- cdef char * start
- cdef char * query
- cdef char * cpy
- cdef char * end
- cdef int l
- query = item
-
- start = strstr( self.attributes, query)
- if start == NULL:
- raise AttributeError("'GTFProxy' has no attribute '%s'" % item )
-
- start += strlen(query) + 1
- # skip gaps before
- while start[0] == " ": start += 1
- if start[0] == '"':
- start += 1
- end = start
- while end[0] != '\0' and end[0] != '"': end += 1
- l = end - start + 1
- cpy = <char*>calloc( l, sizeof(char ) )
- memcpy( cpy, start, l )
- cpy[l-1] = '\0'
- result = cpy
- free(cpy)
- return result
- else:
- return start
+ if <void*>self.iterator != NULL:
+ ti_iter_destroy(self.iterator)
- def setAttribute( self, name, value ):
- '''convenience method to set an attribute.'''
- r = self.asDict()
- r[name] = value
- self.fromDict( r )
+#########################################################
+#########################################################
+#########################################################
cdef class Parser:
pass
cdef class asTuple(Parser):
- '''converts a :term:`tabix row` into a python tuple.'''
+ '''converts a :term:`tabix row` into a python tuple.
+
+ Access is by numeric index.
+ '''
def __call__(self, char * buffer, int len):
- cdef TupleProxy r
- r = TupleProxy()
+ cdef TabProxies.TupleProxy r
+ r = TabProxies.TupleProxy()
# need to copy - there were some
# persistence issues with "present"
r.copy( buffer, len )
return r
cdef class asGTF(Parser):
- '''converts a :term:`tabix row` into a GTF record.'''
+ '''converts a :term:`tabix row` into a GTF record with the following
+ fields:
+
+ contig
+ contig
+ feature
+ feature
+ source
+ source
+ start
+ genomic start coordinate (0-based)
+ end
+ genomic end coordinate plus one (0-based)
+ score
+ feature score
+ strand
+ strand
+ frame
+ frame
+ attributes
+ attribute string.
+
+ GTF formatted entries also defined the attributes:
+
+ gene_id
+ the gene identifier
+    transcript_id
+ the transcript identifier
+
+ '''
+ def __call__(self, char * buffer, int len):
+ cdef TabProxies.GTFProxy r
+ r = TabProxies.GTFProxy()
+ r.copy( buffer, len )
+ return r
+
+cdef class asBed( Parser ):
+ '''converts a :term:`tabix row` into a bed record
+ with the following fields:
+
+ contig
+ contig
+ start
+ genomic start coordinate (zero-based)
+ end
+ genomic end coordinate plus one (zero-based)
+ name
+ name of feature.
+ score
+ score of feature
+ strand
+ strand of feature
+ thickStart
+ thickStart
+ thickEnd
+ thickEnd
+ itemRGB
+ itemRGB
+ blockCount
+       number of blocks
+ blockSizes
+ ',' separated string of block sizes
+ blockStarts
+ ',' separated string of block genomic start positions
+
+ Only the first three fields are required. Additional
+    fields are optional, but if one is defined, all the preceding
+ need to be defined as well.
+
+ '''
def __call__(self, char * buffer, int len):
- cdef GTFProxy r
- r = GTFProxy()
+ cdef TabProxies.BedProxy r
+ r = TabProxies.BedProxy()
r.copy( buffer, len )
return r
+cdef class asVCF( Parser ):
+ '''converts a :term:`tabix row` into a VCF record with
+ the following fields:
+
+ contig
+ contig
+ pos
+ chromosomal position, zero-based
+ id
+ id
+ ref
+ reference
+ alt
+ alt
+ qual
+ qual
+ filter
+ filter
+ info
+ info
+ format
+ format specifier.
+
+ Access to genotypes is via index::
+
+ contig = vcf.contig
+ first_sample_genotype = vcf[0]
+ second_sample_genotype = vcf[1]
+
+ '''
+ def __call__(self, char * buffer, int len ):
+ cdef TabProxies.VCFProxy r
+ r = TabProxies.VCFProxy()
+ r.copy( buffer, len )
+ return r
+
+#########################################################
+#########################################################
+#########################################################
cdef class TabixIteratorParsed:
"""iterates over mapped reads in a region.
+
+ Returns parsed data.
"""
-
- cdef ti_iter_t iterator
- cdef tabix_t * tabixfile
- cdef Parser parser
def __cinit__(self,
Tabixfile tabixfile,
cdef char * s
cdef int len
- s = ti_read(self.tabixfile, self.iterator, &len)
- if s == NULL: raise StopIteration
+ while 1:
+ s = ti_read(self.tabixfile, self.iterator, &len)
+ if s == NULL: raise StopIteration
+ # todo: read metachar from configuration
+ if s[0] != '#': break
+
return self.parser(s, len)
def __dealloc__(self):
ti_iter_destroy(self.iterator)
def tabix_compress( filename_in,
- filename_out,
- force = False ):
-
+ filename_out,
+ force = False ):
'''
compress *filename_in* writing the output to *filename_out*.
WINDOW_SIZE = 64 * 1024
- fp = bgzf_open( filename_out, "w")
+ fn = _force_bytes(filename_out)
+ fp = bgzf_open( fn, "w")
if fp == NULL:
raise IOError( "could not open '%s' for writing" )
- fd_src = open(filename_in, O_RDONLY)
+ fn = _force_bytes(filename_in)
+ fd_src = open(fn, O_RDONLY)
if fd_src == 0:
raise IOError( "could not open '%s' for reading" )
buffer = malloc(WINDOW_SIZE)
+ c = 1
while c > 0:
c = read(fd_src, buffer, WINDOW_SIZE)
if not os.path.exists(filename): raise IOError("No such file '%s'" % filename)
if not filename.endswith(".gz"):
-
tabix_compress( filename, filename + ".gz", force = force )
os.unlink( filename )
filename += ".gz"
cdef ti_conf_t conf
conf.preset, conf.sc, conf.bc, conf.ec, conf.meta_char, conf.line_skip = conf_data
- ti_index_build( filename, &conf)
+ fn = _my_encodeFilename( filename )
+ ti_index_build( fn, &conf)
return filename
+
+#########################################################
+#########################################################
+#########################################################
+## Iterators for parsing through unindexed files.
+#########################################################
+ctypedef class tabix_inplace_iterator:
+ '''iterate over ``infile``.
+
+ This iterator is not safe. If the :meth:`__next__()` method is called
+ after ``infile`` is closed, the result is undefined (see ``fclose()``).
+
+ The iterator might either raise a StopIteration or segfault.
+ '''
+
+
+ def __cinit__(self, infile, int buffer_size = 65536 ):
+
+ cdef int fd = PyObject_AsFileDescriptor( infile )
+ if fd == -1: raise ValueError( "I/O operation on closed file." )
+ self.infile = fdopen( fd, 'r')
+
+ if self.infile == NULL: raise ValueError( "I/O operation on closed file." )
+
+ self.buffer = <char*>malloc( buffer_size )
+ self.size = buffer_size
+
+ def __iter__(self):
+ return self
+
+ cdef __cnext__(self):
+
+ cdef char * b
+ cdef size_t nbytes
+ b = self.buffer
+ r = self.Parser()
+
+ while not feof( self.infile ):
+ nbytes = getline( &b, &self.size, self.infile)
+
+ # stop at first error or eof
+ if (nbytes == -1): break
+ # skip comments
+ if (b[0] == '#'): continue
+
+ # skip empty lines
+ if b[0] == '\0' or b[0] == '\n' or b[0] == '\r': continue
+
+ # make sure that entry is complete
+ if b[nbytes-1] != '\n' and b[nbytes-1] != '\r':
+ result = b
+ raise ValueError( "incomplete line at %s" % result )
+
+ # make sure that this goes fully through C
+ # otherwise buffer is copied to/from a
+ # Python object causing segfaults as
+ # the wrong memory is freed
+ r.present( b, nbytes )
+ return r
+
+ raise StopIteration
+
+ def __dealloc__(self):
+ free(self.buffer)
+
+ def __next__(self):
+ return self.__cnext__()
+
+ctypedef class tabix_copy_iterator:
+ '''iterate over ``infile``.
+
+    This iterator is not safe. If the :meth:`__next__()` method is called
+ after ``infile`` is closed, the result is undefined (see ``fclose()``).
+
+ The iterator might either raise a StopIteration or segfault.
+ '''
+
+ def __cinit__(self, infile, Parser parser ):
+
+ cdef int fd = PyObject_AsFileDescriptor( infile )
+ if fd == -1: raise ValueError( "I/O operation on closed file." )
+ self.infile = fdopen( fd, 'r')
+ if self.infile == NULL: raise ValueError( "I/O operation on closed file." )
+ self.parser = parser
+
+ def __iter__(self):
+ return self
+
+ cdef __cnext__(self):
+
+ cdef char * b
+ cdef size_t nbytes
+ cdef int x
+
+ b = NULL
+
+ while not feof( self.infile ):
+
+ # getline allocates on demand
+ # return number of characters read excluding null byte
+ nbytes = getline( &b, &nbytes, self.infile)
+ # stop at first error
+ if (nbytes == -1): break
+ # skip comments
+ if (b[0] == '#'): continue
+
+ # skip empty lines
+ if b[0] == '\0' or b[0] == '\n' or b[0] == '\r': continue
+
+ # make sure that entry is complete
+ if b[nbytes-1] != '\n' and b[nbytes-1] != '\r':
+ result = b
+ free(b)
+ raise ValueError( "incomplete line at %s" % result )
+
+ # make sure that this goes fully through C
+ # otherwise buffer is copied to/from a
+ # Python object causing segfaults as
+ # the wrong memory is freed
+ # -1 to remove the new-line character
+ return self.parser(b, nbytes)
+
+ free(b)
+ raise StopIteration
+
+ def __next__(self):
+ return self.__cnext__()
+
+class tabix_generic_iterator:
+ '''iterate over ``infile``.
+
+ Permits the use of file-like objects for example from the gzip module.
+ '''
+ def __init__(self, infile, parser ):
+
+ self.infile = infile
+ if self.infile.closed: raise ValueError( "I/O operation on closed file." )
+ self.parser = parser
+
+ def __iter__(self):
+ return self
+
+ # cython version - required for python 3
+ def __next__(self):
+
+ cdef char * b, * cpy
+ cdef size_t nbytes
+ while 1:
+
+ line = self.infile.readline()
+ if not line: break
+
+ s = _force_bytes( line )
+ b = s
+ nbytes = len( line )
+ assert b[nbytes] == '\0'
+
+ # skip comments
+ if (b[0] == '#'): continue
+
+ # skip empty lines
+ if b[0] == '\0' or b[0] == '\n' or b[0] == '\r': continue
+
+ # make sure that entry is complete
+ if b[nbytes-1] != '\n' and b[nbytes-1] != '\r':
+ raise ValueError( "incomplete line at %s" % line )
+
+ # create a copy
+ cpy = <char*>malloc(nbytes+1)
+ if cpy == NULL: raise MemoryError()
+ memcpy( cpy, b, nbytes+1)
+
+ return self.parser(cpy, nbytes)
+
+ raise StopIteration
+
+ # python version - required for python 2.7
+ def next(self):
+ return self.__next__()
+
+def tabix_iterator( infile, parser ):
+ """return an iterator over all entries in a file."""
+ return tabix_generic_iterator( infile, parser )
+ # file objects can use C stdio
+ # used to be: isinstance( infile, file):
+ # if PYTHON3:
+ # if isinstance( infile, io.IOBase ):
+ # return tabix_copy_iterator( infile, parser )
+ # else:
+ # return tabix_generic_iterator( infile, parser )
+ # else:
+# if isinstance( infile, file ):
+# return tabix_copy_iterator( infile, parser )
+# else:
+# return tabix_generic_iterator( infile, parser )
__all__ = ["tabix_index",
"tabix_compress",
"Tabixfile",
"asTuple",
"asGTF",
+ "asVCF",
+ "asBed",
+ "tabix_iterator",
+ "tabix_inplace_iterator"
]