Merge tag 'upstream/0.7'

[pysam.git] / pysam / TabProxies.pyx
diff --git a/pysam/TabProxies.pyx b/pysam/TabProxies.pyx

index 6e3a866d07c212b148cb7c4835a862ba85842ced..396445b9a4bfd3e913afc9a0ca58bc207f23a928 100644 (file)
--- a/pysam/TabProxies.pyx
+++ b/pysam/TabProxies.pyx
@@ -1,5 +1,69 @@
-import types
-from cpython cimport PyString_FromStringAndSize, PyString_AsString, PyString_AS_STRING
+import types, sys
+
+from cpython.version cimport PY_MAJOR_VERSION
+
+from cpython cimport PyErr_SetString, PyBytes_Check, PyUnicode_Check, PyBytes_FromStringAndSize
+
+cdef from_string_and_size(char* s, size_t length):
+    if PY_MAJOR_VERSION < 3:
+        return s[:length]
+    else:
+        return s[:length].decode("ascii")
+
+# filename encoding (copied from lxml.etree.pyx)
+cdef str _FILENAME_ENCODING
+_FILENAME_ENCODING = sys.getfilesystemencoding()
+if _FILENAME_ENCODING is None:
+    _FILENAME_ENCODING = sys.getdefaultencoding()
+if _FILENAME_ENCODING is None:
+    _FILENAME_ENCODING = 'ascii'
+
+cdef bytes _my_encodeFilename(object filename):
+    u"""Make sure a filename is 8-bit encoded (or None).
+    """
+    if filename is None:
+        return None
+    elif PyBytes_Check(filename):
+        return filename
+    elif PyUnicode_Check(filename):
+        return filename.encode(_FILENAME_ENCODING)
+    else:
+        raise TypeError, u"Argument must be string or unicode."
+
+cdef bytes _force_bytes(object s):
+    u"""convert string or unicode object to bytes, assuming ascii encoding.
+    """
+    if PY_MAJOR_VERSION < 3:
+        return s
+    elif s is None:
+        return None
+    elif PyBytes_Check(s):
+        return s
+    elif PyUnicode_Check(s):
+        return s.encode('ascii')
+    else:
+        raise TypeError, u"Argument must be string, bytes or unicode."
+
+cdef inline bytes _force_cmdline_bytes(object s):
+    return _force_bytes(s)
+
+cdef _charptr_to_str(char* s):
+    if PY_MAJOR_VERSION < 3:
+        return s
+    else:
+        return s.decode("ascii")
+
+cdef _force_str(object s):
+    """Return s converted to str type of current Python (bytes in Py2, unicode in Py3)"""
+    if s is None:
+        return None
+    if PY_MAJOR_VERSION < 3:
+        return s
+    elif PyBytes_Check(s):
+        return s.decode('ascii')
+    else:
+        # assume unicode
+        return s
  
  cdef char * nextItem( char * buffer ):
      cdef char * pos
  
  cdef char * nextItem( char * buffer ):
      cdef char * pos
@@ -25,6 +89,7 @@ cdef class TupleProxy:
      Access to individual fields is via the [] operator.
      
      Only read-only access is implemented.
      Access to individual fields is via the [] operator.
      
      Only read-only access is implemented.
+
      '''
  
      def __cinit__(self ): 
      '''
  
      def __cinit__(self ): 
@@ -65,7 +130,9 @@ cdef class TupleProxy:
          self.update( buffer, nbytes )
  
      cdef copy( self, char * buffer, size_t nbytes ):
          self.update( buffer, nbytes )
  
      cdef copy( self, char * buffer, size_t nbytes ):
-        '''start presenting buffer.
+        '''start presenting buffer of size *nbytes*.
+
+        Buffer is a '\0'-terminated string without the '\n'.
  
          Take a copy of buffer.
          '''
  
          Take a copy of buffer.
          '''
@@ -86,21 +153,36 @@ cdef class TupleProxy:
      cdef update( self, char * buffer, size_t nbytes ):
          '''update internal data.
  
      cdef update( self, char * buffer, size_t nbytes ):
          '''update internal data.
  
+        *buffer* is a \0 terminated string.
+
+        *nbytes* is the number of bytes in buffer (excluding
+        the \0)
+
          Update starts work in buffer, thus can be used
          to collect any number of fields until nbytes
          is exhausted.
  
          Update starts work in buffer, thus can be used
          to collect any number of fields until nbytes
          is exhausted.
  
-        If max_fields is set, the number of fields is initialized to max_fields.
-
+        If max_fields is set, the number of fields is initialized to 
+        max_fields.
          '''
          cdef char * pos
          cdef char * old_pos
          cdef int field
          cdef int max_fields, x
          '''
          cdef char * pos
          cdef char * old_pos
          cdef int field
          cdef int max_fields, x
+        
+        assert strlen(buffer) == nbytes
  
          if buffer[nbytes] != 0:
              raise ValueError( "incomplete line at %s" % buffer )
  
  
          if buffer[nbytes] != 0:
              raise ValueError( "incomplete line at %s" % buffer )
  
+        #################################
+        # remove trailing line feeds and carriage returns and update number of bytes
+        x = nbytes - 1
+        while x > 0 and (buffer[x] == '\n' or buffer[x] == '\r'): 
+            buffer[x] = '\0'
+            x -= 1
+        self.nbytes = x + 1
+
          #################################
          # clear data
          if self.fields != NULL: free(self.fields)
          #################################
          # clear data
          if self.fields != NULL: free(self.fields)
@@ -133,7 +215,7 @@ cdef class TupleProxy:
              pos += 1
              self.fields[field] = pos
              field += 1
              pos += 1
              self.fields[field] = pos
              field += 1
-            if field >= max_fields:
+            if field > max_fields:
                  raise ValueError("row too large - more than %i fields" % max_fields )
              nbytes -= pos - old_pos
              if nbytes < 0: break
                  raise ValueError("row too large - more than %i fields" % max_fields )
              nbytes -= pos - old_pos
              if nbytes < 0: break
@@ -149,7 +231,7 @@ cdef class TupleProxy:
          i += self.offset
          if i >= self.nfields:
              raise IndexError( "list index out of range %i >= %i" % (i, self.nfields ))
          i += self.offset
          if i >= self.nfields:
              raise IndexError( "list index out of range %i >= %i" % (i, self.nfields ))
-        return self.fields[i]
+        return self.fields[i] 
  
      def __getitem__( self, key ):
          if type(key) == int: return self._getindex( key )
  
      def __getitem__( self, key ):
          if type(key) == int: return self._getindex( key )
@@ -177,7 +259,8 @@ cdef class TupleProxy:
              return
  
          # conversion with error checking
              return
  
          # conversion with error checking
-        cdef char * tmp = PyString_AsString( value )
+        value = _force_bytes(value)
+        cdef char * tmp = <char*>value
          self.fields[idx] = <char*>malloc( (strlen( tmp ) + 1) * sizeof(char) )
          if self.fields[idx] == NULL:
              raise ValueError("out of memory" )
          self.fields[idx] = <char*>malloc( (strlen( tmp ) + 1) * sizeof(char) )
          if self.fields[idx] == NULL:
              raise ValueError("out of memory" )
@@ -213,7 +296,10 @@ cdef class TupleProxy:
          # copy and replace \0 bytes with \t characters
          if self.is_modified:
              # todo: treat NULL values
          # copy and replace \0 bytes with \t characters
          if self.is_modified:
              # todo: treat NULL values
-            return "\t".join( [StrOrEmpty( self.fields[x]) for x in xrange(0, self.nfields ) ] )
+            result = []
+            for x in xrange( 0, self.nfields ):
+                result.append( StrOrEmpty( self.fields[x]).decode('ascii') )
+            return "\t".join( result )
          else:
              cpy = <char*>calloc( sizeof(char), self.nbytes+1 )
              if cpy == NULL:
          else:
              cpy = <char*>calloc( sizeof(char), self.nbytes+1 )
              if cpy == NULL:
@@ -221,9 +307,9 @@ cdef class TupleProxy:
              memcpy( cpy, self.data, self.nbytes+1)
              for x from 0 <= x < self.nbytes:
                  if cpy[x] == '\0': cpy[x] = '\t'
              memcpy( cpy, self.data, self.nbytes+1)
              for x from 0 <= x < self.nbytes:
                  if cpy[x] == '\0': cpy[x] = '\t'
-            result = PyString_FromStringAndSize(cpy, self.nbytes)
+            result = cpy[:self.nbytes]
              free(cpy)
              free(cpy)
-            return result
+            return result.decode('ascii')
  
  def toDot( v ):
      '''convert value to '.' if None'''
  
  def toDot( v ):
      '''convert value to '.' if None'''
@@ -248,114 +334,88 @@ cdef class GTFProxy( TupleProxy ):
  
      The only exception is the attributes field when set from
      a dictionary - this field will manage its own memory.
  
      The only exception is the attributes field when set from
      a dictionary - this field will manage its own memory.
-
      '''
  
      def __cinit__(self ): 
          # automatically calls TupleProxy.__cinit__
          self.hasOwnAttributes = False
      '''
  
      def __cinit__(self ): 
          # automatically calls TupleProxy.__cinit__
          self.hasOwnAttributes = False
+        self._attributes = NULL
  
      def __dealloc__(self):
          # automatically calls TupleProxy.__dealloc__
          if self.hasOwnAttributes:
  
      def __dealloc__(self):
          # automatically calls TupleProxy.__dealloc__
          if self.hasOwnAttributes:
-            free(self.attributes)
+            free(self._attributes)
  
      cdef int getMaxFields( self, size_t nbytes ):
          '''return max number of fields.'''
          return 9
  
  
      cdef int getMaxFields( self, size_t nbytes ):
          '''return max number of fields.'''
          return 9
  
-    cdef update( self, char * buffer, size_t nbytes ):
-        '''update internal data.
-
-        nbytes does not include the terminal '\0'.
-        '''
-        cdef int end
-        cdef char * cstart, * cend, * cscore
-        self.contig = buffer
-        cdef char * pos
-
-        if buffer[nbytes] != 0:
-            raise ValueError( "incomplete line at %s" % buffer )
-        
-        self.source = pos = nextItem( buffer )
-        self.feature = pos = nextItem( pos )
-        cstart = pos = nextItem( pos )
-        cend = pos = nextItem( pos )
-        self.score = pos = nextItem( pos )
-        self.strand = pos = nextItem( pos )
-        self.frame = pos = nextItem( pos )
-        self.attributes = pos = nextItem( pos )
-
-        self.start = atoi( cstart ) - 1
-        self.end = atoi( cend )
-        self.nfields = 9
-       
      property contig:
         '''contig of feature.'''
      property contig:
         '''contig of feature.'''
-       def __get__( self ): return self.contig
-       def __set__( self, value ): 
-           self.is_modified = True
-           self.contig = value
-
-    property feature:
-       '''feature name.'''
-       def __get__( self ): return self.feature
-       def __set__( self, value ): 
-           self.is_modified = True
-           self.feature = value
+       def __get__( self ): return self._getindex( 0 )
+       def __set__( self, value ): self._setindex( 0, value )
  
      property source:
         '''feature source.'''
  
      property source:
         '''feature source.'''
-       def __get__( self ): return self.source
-       def __set__( self, value ): 
-           self.is_modified = True
-           self.source = value
+       def __get__( self ): return self._getindex( 1 )
+       def __set__( self, value ): self._setindex( 1, value )
+
+    property feature:
+       '''feature name.'''
+       def __get__( self ): return self._getindex( 2 )
+       def __set__( self, value ): self._setindex( 2, value )
  
      property start:
         '''feature start (in 0-based open/closed coordinates).'''
  
      property start:
         '''feature start (in 0-based open/closed coordinates).'''
-       def __get__( self ): return self.start
-       def __set__( self, value ): 
-           self.is_modified = True
-           self.start = value
+       def __get__( self ): return int( self._getindex( 3 )) - 1
+       def __set__( self, value ): self._setindex( 3, str(value+1) )
  
      property end:
         '''feature end (in 0-based open/closed coordinates).'''
  
      property end:
         '''feature end (in 0-based open/closed coordinates).'''
-       def __get__( self ): return self.end
-       def __set__( self, value ): 
-           self.is_modified = True
-           self.end = value
+       def __get__( self ): return int( self._getindex( 4 ) )
+       def __set__( self, value ): self._setindex( 4, str(value) )
  
      property score:
         '''feature score.'''
         def __get__( self ): 
  
      property score:
         '''feature score.'''
         def __get__( self ): 
-           if self.score[0] == '.' and self.score[1] == '\0' :
+           v = self._getindex(5)
+           if v == "" or v[0] == '.':
                 return None
             else:
                 return None
             else:
-               return atof(self.score)
-       def __set__( self, value ): 
-           self.is_modified = True
-           self.score = value
+               return float(v)
+
+       def __set__( self, value ): self._setindex( 5, value )
  
      property strand:
         '''feature strand.'''
  
      property strand:
         '''feature strand.'''
-       def __get__( self ): return self.strand
-       def __set__( self, value ): 
-           self.is_modified = True
-           self.strand = value
+       def __get__( self ): return self._getindex( 6 )
+       def __set__( self, value ): self._setindex( 6, value )
  
      property frame:
         '''feature frame.'''
  
      property frame:
         '''feature frame.'''
-       def __get__( self ): return self.frame
-       def __set__( self, value ): 
-           self.is_modified = True
-           self.frame = value
+       def __get__( self ): return self._getindex( 7 )
+       def __set__( self, value ): self._setindex( 7, value )
  
      property attributes:
         '''feature attributes (as a string).'''
  
      property attributes:
         '''feature attributes (as a string).'''
-       def __get__( self ): return self.attributes
+       def __get__( self ): 
+           if self.hasOwnAttributes:
+               return self._attributes
+           else:
+               return self._getindex( 8 )
         def __set__( self, value ): 
         def __set__( self, value ): 
-           self.is_modified = True
-           self.attributes = value
+           if self.hasOwnAttributes:
+               free(self._attributes)
+               self._attributes = NULL
+               self.hasOwnAttributes = False
+           self._setindex(8, value )
+
+    cdef char * getAttributes( self ):
+       '''return pointer to attributes.'''
+       if self.hasOwnAttributes:
+           return self._attributes
+       else:
+           return self.fields[ 8 ]
  
      def asDict( self ):
          """parse attributes - return as dict
  
      def asDict( self ):
          """parse attributes - return as dict
@@ -398,12 +458,12 @@ cdef class GTFProxy( TupleProxy ):
          cdef int l
  
          # clean up if this field is set twice
          cdef int l
  
          # clean up if this field is set twice
-        if self.hasOwnAttributes:
-            free(self.attributes)
+        if self.hasOwnAttributes: 
+            free(self._attributes)
  
          aa = []
          for k,v in d.items():
  
          aa = []
          for k,v in d.items():
-            if type(v) == types.StringType:
+            if type(v) in types.StringTypes:
                  aa.append( '%s "%s"' % (k,v) )
              else:
                  aa.append( '%s %s' % (k,str(v)) )
                  aa.append( '%s "%s"' % (k,v) )
              else:
                  aa.append( '%s %s' % (k,str(v)) )
@@ -411,10 +471,10 @@ cdef class GTFProxy( TupleProxy ):
          a = "; ".join( aa ) + ";"
          p = a
          l = len(a)
          a = "; ".join( aa ) + ";"
          p = a
          l = len(a)
-        self.attributes = <char *>calloc( l + 1, sizeof(char) )
-        if self.attributes == NULL:
+        self._attributes = <char *>calloc( l + 1, sizeof(char) )
+        if self._attributes == NULL:
              raise ValueError("out of memory" )
              raise ValueError("out of memory" )
-        memcpy( self.attributes, p, l )
+        memcpy( self._attributes, p, l )
  
          self.hasOwnAttributes = True
          self.is_modified = True
  
          self.hasOwnAttributes = True
          self.is_modified = True
@@ -461,28 +521,46 @@ cdef class GTFProxy( TupleProxy ):
          Only called if there *isn't* an attribute with this name
          """
          cdef char * start
          Only called if there *isn't* an attribute with this name
          """
          cdef char * start
-        cdef char * query 
+        cdef char * query
          cdef char * cpy
          cdef char * end
          cdef int l
          cdef char * cpy
          cdef char * end
          cdef int l
-        query = item
-        
-        start = strstr( self.attributes, query)
+
+        #
+        # important to use the getAttributes function.
+        # Using the self.attributes property to access
+        # the attributes caused a hard-to-trace bug
+        # in which fields in the attribute string were
+        # set to 0.
+        # Running through valgrind complained that
+        # memory was accessed in the memory field
+        # that has been released. It is not clear
+        # why this happened and might be a cython bug
+        # (Version 0.16). The valgrind warnings
+        # disappeared after accessing the C data structures
+        # directly and so did the bug.
+        cdef char * attributes = self.getAttributes()
+
+        r = _force_bytes(item)
+        query = r
+        start = strstr( attributes, query)
+
          if start == NULL:
              raise AttributeError("'GTFProxy' has no attribute '%s'" % item )
  
          start += strlen(query) + 1
          # skip gaps before
          while start[0] == ' ': start += 1
          if start == NULL:
              raise AttributeError("'GTFProxy' has no attribute '%s'" % item )
  
          start += strlen(query) + 1
          # skip gaps before
          while start[0] == ' ': start += 1
+
          if start[0] == '"':
              start += 1
              end = start
              while end[0] != '\0' and end[0] != '"': end += 1
              l = end - start
          if start[0] == '"':
              start += 1
              end = start
              while end[0] != '\0' and end[0] != '"': end += 1
              l = end - start
-            result = PyString_FromStringAndSize( start, l )
+            result = _force_str( PyBytes_FromStringAndSize( start, l ) )
              return result
          else:
              return result
          else:
-            return start
+            return _force_str( start )
  
      def setAttribute( self, name, value ):
          '''convenience method to set an attribute.'''
  
      def setAttribute( self, name, value ):
          '''convenience method to set an attribute.'''
@@ -515,18 +593,18 @@ cdef class BedProxy( NamedTupleProxy ):
      This class represents a BED entry for fast read-access.
      '''
      map_key2field = { 
      This class represents a BED entry for fast read-access.
      '''
      map_key2field = { 
-        'contig' : (0, str),
+        'contig' : (0, bytes),
          'start' : (1, int),
          'end' : (2, int),
          'start' : (1, int),
          'end' : (2, int),
-        'name' : (3, str),
+        'name' : (3, bytes),
          'score' : (4, float),
          'score' : (4, float),
-        'strand' : (5, str),
-        'thickStart' : (6,int ),
-        'thickEnd' : (7,int),
-        'itemRGB' : (8,str),
-        'blockCount': (9,int),
-        'blockSizes': (10,str),
-        'blockStarts': (11,str), } 
+        'strand' : (5, bytes),
+        'thickStart' : (6, int ),
+        'thickEnd' : (7, int),
+        'itemRGB' : (8, bytes),
+        'blockCount': (9, int),
+        'blockSizes': (10, bytes),
+        'blockStarts': (11, bytes), } 
  
      cdef int getMaxFields( self, size_t nbytes ):
          '''return max number of fields.'''
  
      cdef int getMaxFields( self, size_t nbytes ):
          '''return max number of fields.'''
@@ -581,15 +659,15 @@ cdef class VCFProxy( NamedTupleProxy ):
      The genotypes are accessed via index.
      '''
      map_key2field = { 
      The genotypes are accessed via index.
      '''
      map_key2field = { 
-        'contig' : (0, str),
+        'contig' : (0, bytes),
          'pos' : (1, int),
          'pos' : (1, int),
-        'id' : (2, str),
-        'ref' : (3, str),
-        'alt' : (4, str),
-        'qual' : (5, str),
-        'filter' : (6,str),
-        'info' : (7,str),
-        'format' : (8,str) }
+        'id' : (2, bytes),
+        'ref' : (3, bytes),
+        'alt' : (4, bytes),
+        'qual' : (5, bytes),
+        'filter' : (6, bytes),
+        'info' : (7, bytes),
+        'format' : (8, bytes) }
  
      def __cinit__(self ): 
          # automatically calls TupleProxy.__cinit__
  
      def __cinit__(self ): 
          # automatically calls TupleProxy.__cinit__
@@ -608,6 +686,7 @@ cdef class VCFProxy( NamedTupleProxy ):
          self.pos = atoi( self.fields[1] ) - 1
                               
      def __len__(self):
          self.pos = atoi( self.fields[1] ) - 1
                               
      def __len__(self):
+        '''return number of genotype fields.'''
          return max(0, self.nfields - 9)
  
      property pos:
          return max(0, self.nfields - 9)
  
      property pos: