-import types
-from cpython cimport PyString_FromStringAndSize, PyString_AsString, PyString_AS_STRING
+import types, sys
+
+from cpython.version cimport PY_MAJOR_VERSION
+
+from cpython cimport PyErr_SetString, PyBytes_Check, PyUnicode_Check, PyBytes_FromStringAndSize
+
+cdef from_string_and_size(char* s, size_t length):
+    """Return the first *length* bytes of *s* as the native str type.
+
+    Python 2 gets the raw byte string; Python 3 gets those bytes decoded
+    as ASCII.
+    """
+    cdef bytes raw = s[:length]
+    if PY_MAJOR_VERSION >= 3:
+        return raw.decode("ascii")
+    return raw
+
+# Filename encoding (copied from lxml.etree.pyx).
+# Chosen by these module-level statements: the filesystem encoding is
+# preferred, then the interpreter default encoding, and 'ascii' is the
+# last resort when both report None.
+cdef str _FILENAME_ENCODING
+_FILENAME_ENCODING = sys.getfilesystemencoding()
+if _FILENAME_ENCODING is None:
+ _FILENAME_ENCODING = sys.getdefaultencoding()
+if _FILENAME_ENCODING is None:
+ _FILENAME_ENCODING = 'ascii'
+
+cdef bytes _my_encodeFilename(object filename):
+    u"""Return *filename* as an 8-bit encoded byte string (or None).
+
+    ``None`` and byte strings are passed through unchanged; unicode
+    objects are encoded with ``_FILENAME_ENCODING``. Any other type
+    raises TypeError.
+    """
+    if filename is None:
+        return None
+    if PyBytes_Check(filename):
+        return filename
+    if not PyUnicode_Check(filename):
+        raise TypeError(u"Argument must be string or unicode.")
+    return filename.encode(_FILENAME_ENCODING)
+
+cdef bytes _force_bytes(object s):
+    u"""Convert a string or unicode object to bytes, assuming ASCII.
+
+    ``None`` is passed through and byte strings are returned unchanged.
+    Unicode objects are ASCII-encoded. The same type checks run on
+    Python 2 and Python 3, so a unicode argument is encoded on Python 2
+    as well and an unsupported type always gets the explicit TypeError.
+
+    Raises TypeError for any other argument type and UnicodeEncodeError
+    for unicode input that is not pure ASCII.
+    """
+    if s is None:
+        return None
+    elif PyBytes_Check(s):
+        # on Python 2 this branch also covers the native str type
+        return s
+    elif PyUnicode_Check(s):
+        return s.encode('ascii')
+    else:
+        raise TypeError(u"Argument must be string, bytes or unicode.")
+
+cdef inline bytes _force_cmdline_bytes(object s):
+    """Convert *s* to bytes by delegating to ``_force_bytes``."""
+    cdef bytes encoded = _force_bytes(s)
+    return encoded
+
+cdef _charptr_to_str(char* s):
+    """Convert the C string *s* to the native str type.
+
+    Python 2 receives the byte string as is; Python 3 receives it decoded
+    as ASCII.
+    """
+    if PY_MAJOR_VERSION >= 3:
+        return s.decode("ascii")
+    return s
+
+cdef _force_str(object s):
+    """Return *s* as the native str type (bytes on Py2, unicode on Py3).
+
+    ``None`` is passed through. On Python 3, bytes are decoded as ASCII
+    and anything else is assumed to be unicode already.
+    """
+    if s is None or PY_MAJOR_VERSION < 3:
+        return s
+    if PyBytes_Check(s):
+        return s.decode('ascii')
+    # not bytes: assume it is already unicode
+    return s
cdef char * nextItem( char * buffer ):
cdef char * pos
Access to individual fields is via the [] operator.
Only read-only access is implemented.
+
'''
def __cinit__(self ):
self.update( buffer, nbytes )
cdef copy( self, char * buffer, size_t nbytes ):
- '''start presenting buffer.
+ '''start presenting buffer of size *nbytes*.
+
+ Buffer is a '\0'-terminated string without the '\n'.
Take a copy of buffer.
'''
cdef update( self, char * buffer, size_t nbytes ):
'''update internal data.
+ *buffer* is a \0 terminated string.
+
+ *nbytes* is the number of bytes in buffer (excluding
+ the \0)
+
Update starts work in buffer, thus can be used
to collect any number of fields until nbytes
is exhausted.
- If max_fields is set, the number of fields is initialized to max_fields.
-
+ If max_fields is set, the number of fields is initialized to
+ max_fields.
'''
cdef char * pos
cdef char * old_pos
cdef int field
cdef int max_fields, x
+
+ assert strlen(buffer) == nbytes
if buffer[nbytes] != 0:
raise ValueError( "incomplete line at %s" % buffer )
+ #################################
+ # remove line breaks and feeds and update number of bytes
+ x = nbytes - 1
+ while x > 0 and (buffer[x] == '\n' or buffer[x] == '\r'):
+ buffer[x] = '\0'
+ x -= 1
+ self.nbytes = x + 1
+
#################################
# clear data
if self.fields != NULL: free(self.fields)
pos += 1
self.fields[field] = pos
field += 1
- if field >= max_fields:
+ if field > max_fields:
raise ValueError("row too large - more than %i fields" % max_fields )
nbytes -= pos - old_pos
if nbytes < 0: break
self.nfields = field
- def __getitem__( self, key ):
-
- cdef int i = key
+ def _getindex( self, int index ):
+ '''return item at idx index'''
+ cdef int i = index
if i < 0: i += self.nfields
if i < 0: raise IndexError( "list index out of range" )
i += self.offset
if i >= self.nfields:
- raise IndexError( "list index out of range" )
- return self.fields[i]
+ raise IndexError( "list index out of range %i >= %i" % (i, self.nfields ))
+ return self.fields[i]
+
+ def __getitem__( self, key ):
+     '''return the field at index *key*, or a list of fields for a slice.
+
+     Slice bounds are resolved against the number of fields. Every
+     other key is handed to _getindex, which is declared with an int
+     parameter and raises IndexError when the index is out of range.
+     '''
+     if not isinstance( key, slice ):
+         # dispatch on the slice type rather than an exact ``int`` type
+         # test, so Python 2 long values and int subclasses are treated
+         # as indices instead of falling into the slice branch
+         return self._getindex( key )
+     start, end, step = key.indices( self.nfields )
+     return [ self._getindex( index ) for index in range( start, end, step ) ]
def _setindex( self, index, value ):
'''set item at idx index.'''
return
# conversion with error checking
- cdef char * tmp = PyString_AsString( value )
+ value = _force_bytes(value)
+ cdef char * tmp = <char*>value
self.fields[idx] = <char*>malloc( (strlen( tmp ) + 1) * sizeof(char) )
if self.fields[idx] == NULL:
raise ValueError("out of memory" )
# copy and replace \0 bytes with \t characters
if self.is_modified:
# todo: treat NULL values
- return "\t".join( [StrOrEmpty( self.fields[x]) for x in xrange(0, self.nfields ) ] )
+ result = []
+ for x in xrange( 0, self.nfields ):
+ result.append( StrOrEmpty( self.fields[x]).decode('ascii') )
+ return "\t".join( result )
else:
cpy = <char*>calloc( sizeof(char), self.nbytes+1 )
if cpy == NULL:
memcpy( cpy, self.data, self.nbytes+1)
for x from 0 <= x < self.nbytes:
if cpy[x] == '\0': cpy[x] = '\t'
- result = PyString_FromStringAndSize(cpy, self.nbytes)
+ result = cpy[:self.nbytes]
free(cpy)
- return result
+ return result.decode('ascii')
def toDot( v ):
'''convert value to '.' if None'''
The only exception is the attributes field when set from
a dictionary - this field will manage its own memory.
-
'''
def __cinit__(self ):
# automatically calls TupleProxy.__cinit__
self.hasOwnAttributes = False
+ self._attributes = NULL
def __dealloc__(self):
# automatically calls TupleProxy.__dealloc__
if self.hasOwnAttributes:
- free(self.attributes)
+ free(self._attributes)
cdef int getMaxFields( self, size_t nbytes ):
'''return max number of fields.'''
return 9
- cdef update( self, char * buffer, size_t nbytes ):
- '''update internal data.
-
- nbytes does not include the terminal '\0'.
- '''
- cdef int end
- cdef char * cstart, * cend, * cscore
- self.contig = buffer
- cdef char * pos
-
- if buffer[nbytes] != 0:
- raise ValueError( "incomplete line at %s" % buffer )
-
- self.source = pos = nextItem( buffer )
- self.feature = pos = nextItem( pos )
- cstart = pos = nextItem( pos )
- cend = pos = nextItem( pos )
- self.score = pos = nextItem( pos )
- self.strand = pos = nextItem( pos )
- self.frame = pos = nextItem( pos )
- self.attributes = pos = nextItem( pos )
-
- self.start = atoi( cstart ) - 1
- self.end = atoi( cend )
-
property contig:
'''contig of feature.'''
- def __get__( self ): return self.contig
- def __set__( self, value ):
- self.is_modified = True
- self.contig = value
-
- property feature:
- '''feature name.'''
- def __get__( self ): return self.feature
- def __set__( self, value ):
- self.is_modified = True
- self.feature = value
+ def __get__( self ): return self._getindex( 0 )
+ def __set__( self, value ): self._setindex( 0, value )
property source:
'''feature source.'''
- def __get__( self ): return self.source
- def __set__( self, value ):
- self.is_modified = True
- self.source = value
+ def __get__( self ): return self._getindex( 1 )
+ def __set__( self, value ): self._setindex( 1, value )
+
+ property feature:
+     '''feature name (stored in field 2).'''
+     def __get__( self ):
+         return self._getindex( 2 )
+
+     def __set__( self, value ):
+         self._setindex( 2, value )
property start:
'''feature start (in 0-based open/closed coordinates).'''
- def __get__( self ): return self.start
- def __set__( self, value ):
- self.is_modified = True
- self.start = value
+ def __get__( self ): return int( self._getindex( 3 )) - 1
+ def __set__( self, value ): self._setindex( 3, str(value+1) )
property end:
'''feature end (in 0-based open/closed coordinates).'''
- def __get__( self ): return self.end
- def __set__( self, value ):
- self.is_modified = True
- self.end = value
+ def __get__( self ): return int( self._getindex( 4 ) )
+ def __set__( self, value ): self._setindex( 4, str(value) )
property score:
'''feature score.'''
def __get__( self ):
- if self.score[0] == '.' and self.score[1] == '\0' :
+ # _getindex returns the raw field as a byte string, so compare
+ # against byte literals: a str literal never equals bytes on
+ # Python 3. Only an empty field or the exact "." placeholder means
+ # "no score"; values such as ".5" are parsed as floats.
+ v = self._getindex(5)
+ if v == b"" or v == b".":
return None
else:
- return atof(self.score)
- def __set__( self, value ):
- self.is_modified = True
- self.score = value
+ return float(v)
+
+ def __set__( self, value ): self._setindex( 5, value )
property strand:
'''feature strand.'''
- def __get__( self ): return self.strand
- def __set__( self, value ):
- self.is_modified = True
- self.strand = value
+ def __get__( self ): return self._getindex( 6 )
+ def __set__( self, value ): self._setindex( 6, value )
property frame:
'''feature frame.'''
- def __get__( self ): return self.frame
- def __set__( self, value ):
- self.is_modified = True
- self.frame = value
+ def __get__( self ): return self._getindex( 7 )
+ def __set__( self, value ): self._setindex( 7, value )
property attributes:
'''feature attributes (as a string).'''
- def __get__( self ): return self.attributes
+ def __get__( self ):
+ if self.hasOwnAttributes:
+ return self._attributes
+ else:
+ return self._getindex( 8 )
def __set__( self, value ):
- self.is_modified = True
- self.attributes = value
+ if self.hasOwnAttributes:
+ free(self._attributes)
+ self._attributes = NULL
+ self.hasOwnAttributes = False
+ self._setindex(8, value )
+
+ cdef char * getAttributes( self ):
+     '''return a pointer to the attribute string.
+
+     This is the separately held _attributes buffer when
+     hasOwnAttributes is set, otherwise field 8 of the tuple.
+     '''
+     if not self.hasOwnAttributes:
+         return self.fields[ 8 ]
+     return self._attributes
def asDict( self ):
"""parse attributes - return as dict
cdef int l
# clean up if this field is set twice
- if self.hasOwnAttributes:
- free(self.attributes)
+ if self.hasOwnAttributes:
+ free(self._attributes)
aa = []
for k,v in d.items():
- if type(v) == types.StringType:
+ if type(v) in types.StringTypes:
aa.append( '%s "%s"' % (k,v) )
else:
aa.append( '%s %s' % (k,str(v)) )
a = "; ".join( aa ) + ";"
p = a
l = len(a)
- self.attributes = <char *>calloc( l + 1, sizeof(char) )
- if self.attributes == NULL:
+ self._attributes = <char *>calloc( l + 1, sizeof(char) )
+ if self._attributes == NULL:
raise ValueError("out of memory" )
- memcpy( self.attributes, p, l )
+ memcpy( self._attributes, p, l )
self.hasOwnAttributes = True
self.is_modified = True
r = self.attributes
return [ x.strip().split(" ")[0] for x in r.split(";") if x.strip() != '' ]
- def __getitem__(self, item):
- return self.__getattr__( item )
+ def __getitem__(self, key):
+ return self.__getattr__( key )
def __getattr__(self, item ):
"""Generic lookup of attribute from GFF/GTF attributes
Only called if there *isn't* an attribute with this name
"""
cdef char * start
- cdef char * query
+ cdef char * query
cdef char * cpy
cdef char * end
cdef int l
- query = item
-
- start = strstr( self.attributes, query)
+
+ #
+ # It is important to use the getAttributes function here.
+ # Accessing the attributes through the self.attributes
+ # property caused a hard-to-trace bug in which fields in
+ # the attribute string were set to 0.
+ # Valgrind complained that memory was accessed in a
+ # region that had already been released. It is not clear
+ # why this happened; it might be a cython bug
+ # (Version 0.16). The valgrind warnings disappeared
+ # after accessing the C data structures directly,
+ # and so did the bug.
+ cdef char * attributes = self.getAttributes()
+
+ r = _force_bytes(item)
+ query = r
+ start = strstr( attributes, query)
+
if start == NULL:
raise AttributeError("'GTFProxy' has no attribute '%s'" % item )
start += strlen(query) + 1
# skip gaps before
- while start[0] == " ": start += 1
+ while start[0] == ' ': start += 1
+
if start[0] == '"':
start += 1
end = start
while end[0] != '\0' and end[0] != '"': end += 1
- l = end - start + 1
- cpy = <char*>calloc( l, sizeof(char ) )
- if cpy == NULL: raise ValueError("out of memory" )
- memcpy( cpy, start, l )
- cpy[l-1] = '\0'
- result = cpy
- free(cpy)
+ l = end - start
+ result = _force_str( PyBytes_FromStringAndSize( start, l ) )
return result
else:
- return start
+ return _force_str( start )
def setAttribute( self, name, value ):
'''convenience method to set an attribute.'''
This class represents a GTF entry for fast read-access.
'''
map_key2field = {
- 'contig' : (0, str),
+ 'contig' : (0, bytes),
'start' : (1, int),
'end' : (2, int),
- 'name' : (3, str),
+ 'name' : (3, bytes),
'score' : (4, float),
- 'strand' : (5, str),
- 'thickStart' : (6,int ),
- 'thickEnd' : (7,int),
- 'itemRGB' : (8,str),
- 'blockCount': (9,int),
- 'blockSizes': (10,str),
- 'blockStarts': (11,str), }
+ 'strand' : (5, bytes),
+ 'thickStart' : (6, int ),
+ 'thickEnd' : (7, int),
+ 'itemRGB' : (8, bytes),
+ 'blockCount': (9, int),
+ 'blockSizes': (10, bytes),
+ 'blockStarts': (11, bytes), }
cdef int getMaxFields( self, size_t nbytes ):
'''return max number of fields.'''
The genotypes are accessed via index.
'''
map_key2field = {
- 'contig' : (0, str),
+ 'contig' : (0, bytes),
'pos' : (1, int),
- 'id' : (2, str),
- 'ref' : (3, str),
- 'alt' : (4, str),
- 'qual' : (5, str),
- 'filter' : (6,str),
- 'info' : (7,str),
- 'format' : (8,str) }
+ 'id' : (2, bytes),
+ 'ref' : (3, bytes),
+ 'alt' : (4, bytes),
+ 'qual' : (5, bytes),
+ 'filter' : (6, bytes),
+ 'info' : (7, bytes),
+ 'format' : (8, bytes) }
def __cinit__(self ):
# automatically calls TupleProxy.__cinit__
self.contig = self.fields[0]
# vcf counts from 1 - correct here
self.pos = atoi( self.fields[1] ) - 1
-
+
def __len__(self):
+ '''return number of genotype fields.'''
return max(0, self.nfields - 9)
+ property pos:
+ '''feature position (0-based; the 1-based VCF coordinate minus one).'''
+ def __get__( self ):
+ return self.pos
+
def __setattr__(self, key, value ):
'''set attribute.'''
if key == "pos":
cdef int idx
idx, f = self.map_key2field[key]
TupleProxy._setindex(self, idx, str(value) )
-
+