X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=pysam.git;a=blobdiff_plain;f=pysam%2FTabProxies.pyx;fp=pysam%2FTabProxies.pyx;h=396445b9a4bfd3e913afc9a0ca58bc207f23a928;hp=6e3a866d07c212b148cb7c4835a862ba85842ced;hb=68c074fc81858150ca7447a88e731eec96be6378;hpb=768881ffd9d33e3c5fa00dd9ea6f488f4f0700b3 diff --git a/pysam/TabProxies.pyx b/pysam/TabProxies.pyx index 6e3a866..396445b 100644 --- a/pysam/TabProxies.pyx +++ b/pysam/TabProxies.pyx @@ -1,5 +1,69 @@ -import types -from cpython cimport PyString_FromStringAndSize, PyString_AsString, PyString_AS_STRING +import types, sys + +from cpython.version cimport PY_MAJOR_VERSION + +from cpython cimport PyErr_SetString, PyBytes_Check, PyUnicode_Check, PyBytes_FromStringAndSize + +cdef from_string_and_size(char* s, size_t length): + if PY_MAJOR_VERSION < 3: + return s[:length] + else: + return s[:length].decode("ascii") + +# filename encoding (copied from lxml.etree.pyx) +cdef str _FILENAME_ENCODING +_FILENAME_ENCODING = sys.getfilesystemencoding() +if _FILENAME_ENCODING is None: + _FILENAME_ENCODING = sys.getdefaultencoding() +if _FILENAME_ENCODING is None: + _FILENAME_ENCODING = 'ascii' + +cdef bytes _my_encodeFilename(object filename): + u"""Make sure a filename is 8-bit encoded (or None). + """ + if filename is None: + return None + elif PyBytes_Check(filename): + return filename + elif PyUnicode_Check(filename): + return filename.encode(_FILENAME_ENCODING) + else: + raise TypeError, u"Argument must be string or unicode." + +cdef bytes _force_bytes(object s): + u"""convert string or unicode object to bytes, assuming ascii encoding. + """ + if PY_MAJOR_VERSION < 3: + return s + elif s is None: + return None + elif PyBytes_Check(s): + return s + elif PyUnicode_Check(s): + return s.encode('ascii') + else: + raise TypeError, u"Argument must be string, bytes or unicode." + +cdef inline bytes _force_cmdline_bytes(object s): + return _force_bytes(s) + +cdef _charptr_to_str(char* s): + if PY_MAJOR_VERSION < 3: + return s + else: + return s.decode("ascii") + +cdef _force_str(object s): + """Return s converted to str type of current Python (bytes in Py2, unicode in Py3)""" + if s is None: + return None + if PY_MAJOR_VERSION < 3: + return s + elif PyBytes_Check(s): + return s.decode('ascii') + else: + # assume unicode + return s cdef char * nextItem( char * buffer ): cdef char * pos @@ -25,6 +89,7 @@ cdef class TupleProxy: Access to individual fields is via the [] operator. Only read-only access is implemented. + ''' def __cinit__(self ): @@ -65,7 +130,9 @@ cdef class TupleProxy: self.update( buffer, nbytes ) cdef copy( self, char * buffer, size_t nbytes ): - '''start presenting buffer. + '''start presenting buffer of size *nbytes*. + + Buffer is a '\0'-terminated string without the '\n'. Take a copy of buffer. ''' @@ -86,21 +153,36 @@ cdef class TupleProxy: cdef update( self, char * buffer, size_t nbytes ): '''update internal data. + *buffer* is a \0 terminated string. + + *nbytes* is the number of bytes in buffer (excluding + the \0) + Update starts work in buffer, thus can be used to collect any number of fields until nbytes is exhausted. - If max_fields is set, the number of fields is initialized to max_fields. - + If max_fields is set, the number of fields is initialized to + max_fields. ''' cdef char * pos cdef char * old_pos cdef int field cdef int max_fields, x + + assert strlen(buffer) == nbytes if buffer[nbytes] != 0: raise ValueError( "incomplete line at %s" % buffer ) + ################################# + # remove line breaks and feeds and update number of bytes + x = nbytes - 1 + while x > 0 and (buffer[x] == '\n' or buffer[x] == '\r'): + buffer[x] = '\0' + x -= 1 + self.nbytes = x + 1 + ################################# # clear data if self.fields != NULL: free(self.fields) @@ -133,7 +215,7 @@ cdef class TupleProxy: pos += 1 self.fields[field] = pos field += 1 - if field >= max_fields: + if field > max_fields: raise ValueError("row too large - more than %i fields" % max_fields ) nbytes -= pos - old_pos if nbytes < 0: break @@ -149,7 +231,7 @@ cdef class TupleProxy: i += self.offset if i >= self.nfields: raise IndexError( "list index out of range %i >= %i" % (i, self.nfields )) - return self.fields[i] + return self.fields[i] def __getitem__( self, key ): if type(key) == int: return self._getindex( key ) @@ -177,7 +259,8 @@ cdef class TupleProxy: return # conversion with error checking - cdef char * tmp = PyString_AsString( value ) + value = _force_bytes(value) + cdef char * tmp = value self.fields[idx] = malloc( (strlen( tmp ) + 1) * sizeof(char) ) if self.fields[idx] == NULL: raise ValueError("out of memory" ) @@ -213,7 +296,10 @@ cdef class TupleProxy: # copy and replace \0 bytes with \t characters if self.is_modified: # todo: treat NULL values - return "\t".join( [StrOrEmpty( self.fields[x]) for x in xrange(0, self.nfields ) ] ) + result = [] + for x in xrange( 0, self.nfields ): + result.append( StrOrEmpty( self.fields[x]).decode('ascii') ) + return "\t".join( result ) else: cpy = calloc( sizeof(char), self.nbytes+1 ) if cpy == NULL: @@ -221,9 +307,9 @@ cdef class TupleProxy: memcpy( cpy, self.data, self.nbytes+1) for x from 0 <= x < self.nbytes: if cpy[x] == '\0': cpy[x] = '\t' - result = PyString_FromStringAndSize(cpy, self.nbytes) + result = cpy[:self.nbytes] free(cpy) - return result + return result.decode('ascii') def toDot( v ): '''convert value to '.' if None''' @@ -248,114 +334,88 @@ cdef class GTFProxy( TupleProxy ): The only exception is the attributes field when set from a dictionary - this field will manage its own memory. - ''' def __cinit__(self ): # automatically calls TupleProxy.__cinit__ self.hasOwnAttributes = False + self._attributes = NULL def __dealloc__(self): # automatically calls TupleProxy.__dealloc__ if self.hasOwnAttributes: - free(self.attributes) + free(self._attributes) cdef int getMaxFields( self, size_t nbytes ): '''return max number of fields.''' return 9 - cdef update( self, char * buffer, size_t nbytes ): - '''update internal data. - - nbytes does not include the terminal '\0'. - ''' - cdef int end - cdef char * cstart, * cend, * cscore - self.contig = buffer - cdef char * pos - - if buffer[nbytes] != 0: - raise ValueError( "incomplete line at %s" % buffer ) - - self.source = pos = nextItem( buffer ) - self.feature = pos = nextItem( pos ) - cstart = pos = nextItem( pos ) - cend = pos = nextItem( pos ) - self.score = pos = nextItem( pos ) - self.strand = pos = nextItem( pos ) - self.frame = pos = nextItem( pos ) - self.attributes = pos = nextItem( pos ) - - self.start = atoi( cstart ) - 1 - self.end = atoi( cend ) - self.nfields = 9 - property contig: '''contig of feature.''' - def __get__( self ): return self.contig - def __set__( self, value ): - self.is_modified = True - self.contig = value - - property feature: - '''feature name.''' - def __get__( self ): return self.feature - def __set__( self, value ): - self.is_modified = True - self.feature = value + def __get__( self ): return self._getindex( 0 ) + def __set__( self, value ): self._setindex( 0, value ) property source: '''feature source.''' - def __get__( self ): return self.source - def __set__( self, value ): - self.is_modified = True - self.source = value + def __get__( self ): return self._getindex( 1 ) + def __set__( self, value ): self._setindex( 1, value ) + + property feature: + '''feature name.''' + def __get__( self ): return self._getindex( 2 ) + def __set__( self, value ): self._setindex( 2, value ) property start: '''feature start (in 0-based open/closed coordinates).''' - def __get__( self ): return self.start - def __set__( self, value ): - self.is_modified = True - self.start = value + def __get__( self ): return int( self._getindex( 3 )) - 1 + def __set__( self, value ): self._setindex( 3, str(value+1) ) property end: '''feature end (in 0-based open/closed coordinates).''' - def __get__( self ): return self.end - def __set__( self, value ): - self.is_modified = True - self.end = value + def __get__( self ): return int( self._getindex( 4 ) ) + def __set__( self, value ): self._setindex( 4, str(value) ) property score: '''feature score.''' def __get__( self ): - if self.score[0] == '.' and self.score[1] == '\0' : + v = self._getindex(5) + if v == "" or v[0] == '.': return None else: - return atof(self.score) - def __set__( self, value ): - self.is_modified = True - self.score = value + return float(v) + + def __set__( self, value ): self._setindex( 5, value ) property strand: '''feature strand.''' - def __get__( self ): return self.strand - def __set__( self, value ): - self.is_modified = True - self.strand = value + def __get__( self ): return self._getindex( 6 ) + def __set__( self, value ): self._setindex( 6, value ) property frame: '''feature frame.''' - def __get__( self ): return self.frame - def __set__( self, value ): - self.is_modified = True - self.frame = value + def __get__( self ): return self._getindex( 7 ) + def __set__( self, value ): self._setindex( 7, value ) property attributes: '''feature attributes (as a string).''' - def __get__( self ): return self.attributes + def __get__( self ): + if self.hasOwnAttributes: + return self._attributes + else: + return self._getindex( 8 ) def __set__( self, value ): - self.is_modified = True - self.attributes = value + if self.hasOwnAttributes: + free(self._attributes) + self._attributes = NULL + self.hasOwnAttributes = False + self._setindex(8, value ) + + cdef char * getAttributes( self ): + '''return pointer to attributes.''' + if self.hasOwnAttributes: + return self._attributes + else: + return self.fields[ 8 ] def asDict( self ): """parse attributes - return as dict @@ -398,12 +458,12 @@ cdef class GTFProxy( TupleProxy ): cdef int l # clean up if this field is set twice - if self.hasOwnAttributes: - free(self.attributes) + if self.hasOwnAttributes: + free(self._attributes) aa = [] for k,v in d.items(): - if type(v) == types.StringType: + if type(v) in types.StringTypes: aa.append( '%s "%s"' % (k,v) ) else: aa.append( '%s %s' % (k,str(v)) ) @@ -411,10 +471,10 @@ cdef class GTFProxy( TupleProxy ): a = "; ".join( aa ) + ";" p = a l = len(a) - self.attributes = calloc( l + 1, sizeof(char) ) - if self.attributes == NULL: + self._attributes = calloc( l + 1, sizeof(char) ) + if self._attributes == NULL: raise ValueError("out of memory" ) - memcpy( self.attributes, p, l ) + memcpy( self._attributes, p, l ) self.hasOwnAttributes = True self.is_modified = True @@ -461,28 +521,46 @@ cdef class GTFProxy( TupleProxy ): Only called if there *isn't* an attribute with this name """ cdef char * start - cdef char * query + cdef char * query cdef char * cpy cdef char * end cdef int l - query = item - - start = strstr( self.attributes, query) + + # + # important to use the getAttributes function. + # Using the self.attributes property to access + # the attributes caused a hard-to-trace bug + # in which fields in the attribute string were + # set to 0. + # Running through valgrind complained that + # memory was accessed in the memory field + # that has been released. It is not clear + # why this happened and might be a cython bug + # (Version 0.16). The valgrind warnings + # disappeard after accessing the C data structures + # directly and so did the bug. + cdef char * attributes = self.getAttributes() + + r = _force_bytes(item) + query = r + start = strstr( attributes, query) + if start == NULL: raise AttributeError("'GTFProxy' has no attribute '%s'" % item ) start += strlen(query) + 1 # skip gaps before while start[0] == ' ': start += 1 + if start[0] == '"': start += 1 end = start while end[0] != '\0' and end[0] != '"': end += 1 l = end - start - result = PyString_FromStringAndSize( start, l ) + result = _force_str( PyBytes_FromStringAndSize( start, l ) ) return result else: - return start + return _force_str( start ) def setAttribute( self, name, value ): '''convenience method to set an attribute.''' @@ -515,18 +593,18 @@ cdef class BedProxy( NamedTupleProxy ): This class represents a GTF entry for fast read-access. ''' map_key2field = { - 'contig' : (0, str), + 'contig' : (0, bytes), 'start' : (1, int), 'end' : (2, int), - 'name' : (3, str), + 'name' : (3, bytes), 'score' : (4, float), - 'strand' : (5, str), - 'thickStart' : (6,int ), - 'thickEnd' : (7,int), - 'itemRGB' : (8,str), - 'blockCount': (9,int), - 'blockSizes': (10,str), - 'blockStarts': (11,str), } + 'strand' : (5, bytes), + 'thickStart' : (6, int ), + 'thickEnd' : (7, int), + 'itemRGB' : (8, bytes), + 'blockCount': (9, int), + 'blockSizes': (10, bytes), + 'blockStarts': (11, bytes), } cdef int getMaxFields( self, size_t nbytes ): '''return max number of fields.''' @@ -581,15 +659,15 @@ cdef class VCFProxy( NamedTupleProxy ): The genotypes are accessed via index. ''' map_key2field = { - 'contig' : (0, str), + 'contig' : (0, bytes), 'pos' : (1, int), - 'id' : (2, str), - 'ref' : (3, str), - 'alt' : (4, str), - 'qual' : (5, str), - 'filter' : (6,str), - 'info' : (7,str), - 'format' : (8,str) } + 'id' : (2, bytes), + 'ref' : (3, bytes), + 'alt' : (4, bytes), + 'qual' : (5, bytes), + 'filter' : (6, bytes), + 'info' : (7, bytes), + 'format' : (8, bytes) } def __cinit__(self ): # automatically calls TupleProxy.__cinit__ @@ -608,6 +686,7 @@ cdef class VCFProxy( NamedTupleProxy ): self.pos = atoi( self.fields[1] ) - 1 def __len__(self): + '''return number of genotype fields.''' return max(0, self.nfields - 9) property pos: