3 from cpython.version cimport PY_MAJOR_VERSION
5 from cpython cimport PyErr_SetString, PyBytes_Check, PyUnicode_Check, PyBytes_FromStringAndSize
7 cdef from_string_and_size(char* s, size_t length):
8 if PY_MAJOR_VERSION < 3:
11 return s[:length].decode("ascii")
# Encoding used for file names (copied from lxml.etree.pyx).
# Resolution order: the filesystem encoding, then the interpreter's
# default encoding, and finally plain 'ascii' as a last resort.
cdef str _FILENAME_ENCODING
_FILENAME_ENCODING = sys.getfilesystemencoding()
if _FILENAME_ENCODING is None:
    _FILENAME_ENCODING = sys.getdefaultencoding()
    if _FILENAME_ENCODING is None:
        _FILENAME_ENCODING = 'ascii'
21 cdef bytes _my_encodeFilename(object filename):
22 u"""Make sure a filename is 8-bit encoded (or None).
26 elif PyBytes_Check(filename):
28 elif PyUnicode_Check(filename):
29 return filename.encode(_FILENAME_ENCODING)
31 raise TypeError, u"Argument must be string or unicode."
33 cdef bytes _force_bytes(object s):
34 u"""convert string or unicode object to bytes, assuming ascii encoding.
36 if PY_MAJOR_VERSION < 3:
40 elif PyBytes_Check(s):
42 elif PyUnicode_Check(s):
43 return s.encode('ascii')
45 raise TypeError, u"Argument must be string, bytes or unicode."
cdef inline bytes _force_cmdline_bytes(object s):
    u"""Convert *s* to bytes by delegating to _force_bytes."""
    cdef bytes encoded = _force_bytes(s)
    return encoded
50 cdef _charptr_to_str(char* s):
51 if PY_MAJOR_VERSION < 3:
54 return s.decode("ascii")
56 cdef _force_str(object s):
57 """Return s converted to str type of current Python (bytes in Py2, unicode in Py3)"""
60 if PY_MAJOR_VERSION < 3:
62 elif PyBytes_Check(s):
63 return s.decode('ascii')
68 cdef char * nextItem( char * buffer ):
70 pos = strchr( buffer, '\t' )
71 if pos == NULL: raise ValueError( "malformatted entry at %s" % buffer )
76 cdef char *StrOrEmpty( char * buffer ):
77 if buffer == NULL: return ""
cdef int isNew( char * p, char * buffer, size_t nbytes ):
    '''Return non-zero if *p* lies outside of *buffer*.

    *buffer* holds *nbytes* bytes of data followed by a terminating
    '\0' at ``buffer[nbytes]``.  Callers use a non-zero result to
    decide that *p* was allocated separately and must be passed to
    free().

    A NULL pointer is never reported as new.
    '''
    if p == NULL:
        return 0
    # The upper bound is inclusive: a pointer to the terminating '\0'
    # at buffer + nbytes (presumably where an empty last field starts
    # after a trailing tab - confirm against update()) still belongs
    # to the buffer.  Treating it as new would make callers free() an
    # interior pointer.
    return not (buffer <= p <= buffer + nbytes)
84 cdef class TupleProxy:
85 '''Proxy class for access to parsed row as a tuple.
87 This class represents a table row for fast read-access.
89 Access to individual fields is via the [] operator.
91 Only read-only access is implemented.
102 # start counting at field offset
105 def __dealloc__(self):
108 for x from 0 <= x < self.nfields:
109 if isNew( self.fields[x], self.data, self.nbytes ):
110 free( self.fields[x] )
111 self.fields[x] = NULL
113 if self.data != NULL: free(self.data)
114 if self.fields != NULL: free( self.fields )
116 cdef take( self, char * buffer, size_t nbytes ):
117 '''start presenting buffer.
119 Take ownership of the pointer.
123 self.update( buffer, nbytes )
125 cdef present( self, char * buffer, size_t nbytes ):
126 '''start presenting buffer.
128 Do not take ownership of the pointer.
130 self.update( buffer, nbytes )
132 cdef copy( self, char * buffer, size_t nbytes ):
133 '''start presenting buffer of size *nbytes*.
135 Buffer is a '\0'-terminated string without the '\n'.
137 Take a copy of buffer.
141 s = sizeof(char) * (nbytes + 1)
142 self.data = <char*>malloc( s )
143 if self.data == NULL:
144 raise ValueError("out of memory" )
146 memcpy( <char*>self.data, buffer, s )
147 self.update( self.data, nbytes )
149 cdef int getMaxFields( self, size_t nbytes ):
150 '''initialize fields.'''
153 cdef update( self, char * buffer, size_t nbytes ):
154 '''update internal data.
156 *buffer* is a \0 terminated string.
158 *nbytes* is the number of bytes in buffer (excluding
161 Update starts work in buffer, thus can be used
162 to collect any number of fields until nbytes
165 If max_fields is set, the number of fields is initialized to
171 cdef int max_fields, x
173 assert strlen(buffer) == nbytes
175 if buffer[nbytes] != 0:
176 raise ValueError( "incomplete line at %s" % buffer )
178 #################################
179 # remove line breaks and feeds and update number of bytes
181 while x > 0 and (buffer[x] == '\n' or buffer[x] == '\r'):
186 #################################
188 if self.fields != NULL: free(self.fields)
190 for field from 0 <= field < self.nfields:
191 if isNew( self.fields[field], self.data, self.nbytes ):
192 free( self.fields[field] )
194 self.is_modified = self.nfields = 0
196 #################################
198 max_fields = self.getMaxFields( nbytes )
199 self.fields = <char **>calloc( max_fields, sizeof(char *) )
200 if self.fields == NULL:
201 raise ValueError("out of memory" )
203 #################################
206 self.fields[field] = pos = buffer
212 pos = <char*>memchr( pos, '\t', nbytes )
213 if pos == NULL: break
216 self.fields[field] = pos
218 if field > max_fields:
219 raise ValueError("row too large - more than %i fields" % max_fields )
220 nbytes -= pos - old_pos
226 def _getindex( self, int index ):
227 '''return item at idx index'''
229 if i < 0: i += self.nfields
230 if i < 0: raise IndexError( "list index out of range" )
232 if i >= self.nfields:
233 raise IndexError( "list index out of range %i >= %i" % (i, self.nfields ))
234 return self.fields[i]
236 def __getitem__( self, key ):
237 if type(key) == int: return self._getindex( key )
239 start, end, step = key.indices( self.nfields )
241 for index in range( start, end, step ):
242 result.append( self._getindex( index ) )
245 def _setindex( self, index, value ):
246 '''set item at idx index.'''
248 if idx < 0: raise IndexError( "list index out of range" )
249 if idx >= self.nfields:
250 raise IndexError( "list index out of range" )
252 if isNew( self.fields[idx], self.data, self.nbytes ):
253 free( self.fields[idx] )
258 self.fields[idx] = NULL
261 # conversion with error checking
262 value = _force_bytes(value)
263 cdef char * tmp = <char*>value
264 self.fields[idx] = <char*>malloc( (strlen( tmp ) + 1) * sizeof(char) )
265 if self.fields[idx] == NULL:
266 raise ValueError("out of memory" )
267 strcpy( self.fields[idx], tmp )
269 def __setitem__(self, index, value ):
270 '''set item at *index* to *value*'''
272 if i < 0: i += self.nfields
275 self._setindex( i, value )
285 """python version of next().
287 if self.index >= self.nfields:
289 cdef char * retval = self.fields[self.index]
291 if retval == NULL: return None
295 '''return original data'''
296 # copy and replace \0 bytes with \t characters
298 # todo: treat NULL values
300 for x in xrange( 0, self.nfields ):
301 result.append( StrOrEmpty( self.fields[x]).decode('ascii') )
302 return "\t".join( result )
304 cpy = <char*>calloc( sizeof(char), self.nbytes+1 )
306 raise ValueError("out of memory" )
307 memcpy( cpy, self.data, self.nbytes+1)
308 for x from 0 <= x < self.nbytes:
309 if cpy[x] == '\0': cpy[x] = '\t'
310 result = cpy[:self.nbytes]
312 return result.decode('ascii')
315 '''convert value to '.' if None'''
316 if v == None: return "."
320 '''return a quoted attribute.'''
321 if type(v) in types.StringTypes:
326 cdef class GTFProxy( TupleProxy ):
327 '''Proxy class for access to GTF fields.
329 This class represents a GTF entry for fast read-access.
330 Write-access has been added as well, though some care must
331 be taken. If any of the string fields (contig, source, ...)
332 are set, the new value is tied to the lifetime of the
333 argument that was supplied.
335 The only exception is the attributes field when set from
336 a dictionary - this field will manage its own memory.
def __cinit__(self):
    # TupleProxy.__cinit__ is called automatically before this one.
    # Start out without a separately allocated attribute string.
    self._attributes = NULL
    self.hasOwnAttributes = False
def __dealloc__(self):
    # TupleProxy.__dealloc__ is called automatically as well.
    if not self.hasOwnAttributes:
        return
    # Release the attribute string owned by this object.
    free(self._attributes)
349 cdef int getMaxFields( self, size_t nbytes ):
350 '''return max number of fields.'''
354 '''contig of feature.'''
def __get__(self):
    # The contig is stored in field 0.
    return self._getindex(0)
def __set__(self, value):
    # Store *value* in field 0 (contig).
    self._setindex(0, value)
359 '''feature source.'''
def __get__(self):
    # The feature source is stored in field 1.
    return self._getindex(1)
def __set__(self, value):
    # Store *value* in field 1 (feature source).
    self._setindex(1, value)
def __get__(self):
    # Read field 2 of the row.
    return self._getindex(2)
def __set__(self, value):
    # Store *value* in field 2 of the row.
    self._setindex(2, value)
369 '''feature start (in 0-based open/closed coordinates).'''
def __get__(self):
    # Field 3 is parsed as an integer and decremented by one
    # before it is returned.
    stored = self._getindex(3)
    return int(stored) - 1
def __set__(self, value):
    # Field 3 receives the text form of value + 1.
    shifted = value + 1
    self._setindex(3, str(shifted))
374 '''feature end (in 0-based open/closed coordinates).'''
def __get__(self):
    # Field 4 is parsed as an integer and returned unchanged.
    stored = self._getindex(4)
    return int(stored)
def __set__(self, value):
    # Field 4 receives the text form of *value*.
    as_text = str(value)
    self._setindex(4, as_text)
381 v = self._getindex(5)
382 if v == "" or v[0] == '.':
def __set__(self, value):
    # Store *value* in field 5 of the row.
    self._setindex(5, value)
390 '''feature strand.'''
def __get__(self):
    # The feature strand is stored in field 6.
    return self._getindex(6)
def __set__(self, value):
    # Store *value* in field 6 (feature strand).
    self._setindex(6, value)
def __get__(self):
    # Read field 7 of the row.
    return self._getindex(7)
def __set__(self, value):
    # Store *value* in field 7 of the row.
    self._setindex(7, value)
400 '''feature attributes (as a string).'''
402 if self.hasOwnAttributes:
403 return self._attributes
405 return self._getindex( 8 )
def __set__(self, value):
    # A separately allocated attribute string (flagged by
    # hasOwnAttributes) is released before the new value is stored.
    if self.hasOwnAttributes:
        self.hasOwnAttributes = False
        free(self._attributes)
        self._attributes = NULL
    # The attributes string is stored in field 8 of the row.
    self._setindex(8, value)
413 cdef char * getAttributes( self ):
414 '''return pointer to attributes.'''
415 if self.hasOwnAttributes:
416 return self._attributes
418 return self.fields[ 8 ]
421 """parse attributes - return as dict
425 attributes = self.attributes
427 # separate into fields
428 fields = [ x.strip() for x in attributes.split(";")[:-1]]
434 d = [ x.strip() for x in f.split(" ")]
437 if len(d) > 2: v = d[1:]
439 if v[0] == '"' and v[-1] == '"':
442 ## try to convert to a value
455 def fromDict( self, d ):
456 '''set attributes from a dictionary.'''
460 # clean up if this field is set twice
461 if self.hasOwnAttributes:
462 free(self._attributes)
465 for k,v in d.items():
466 if type(v) in types.StringTypes:
467 aa.append( '%s "%s"' % (k,v) )
469 aa.append( '%s %s' % (k,str(v)) )
471 a = "; ".join( aa ) + ";"
474 self._attributes = <char *>calloc( l + 1, sizeof(char) )
475 if self._attributes == NULL:
476 raise ValueError("out of memory" )
477 memcpy( self._attributes, p, l )
479 self.hasOwnAttributes = True
480 self.is_modified = True
498 return TupleProxy.__str__(self)
500 def invert( self, int lcontig ):
501 '''invert coordinates to negative strand coordinates
503 This method will only act if the feature is on the
506 if self.strand[0] == '-':
507 start = min(self.start, self.end)
508 end = max(self.start, self.end)
509 self.start, self.end = lcontig - end, lcontig - start
512 '''return a list of attributes defined in this entry.'''
514 return [ x.strip().split(" ")[0] for x in r.split(";") if x.strip() != '' ]
def __getitem__(self, key):
    # Item access is routed through the attribute lookup.
    looked_up = self.__getattr__(key)
    return looked_up
519 def __getattr__(self, item ):
520 """Generic lookup of attribute from GFF/GTF attributes
521 Only called if there *isn't* an attribute with this name
530 # important to use the getAttributes function.
531 # Using the self.attributes property to access
532 # the attributes caused a hard-to-trace bug
533 # in which fields in the attribute string were
535 # Running through valgrind complained that
536 # memory was accessed in the memory field
537 # that has been released. It is not clear
538 # why this happened and might be a cython bug
539 # (Version 0.16). The valgrind warnings
540 # disappeared after accessing the C data structures
541 # directly and so did the bug.
542 cdef char * attributes = self.getAttributes()
544 r = _force_bytes(item)
546 start = strstr( attributes, query)
549 raise AttributeError("'GTFProxy' has no attribute '%s'" % item )
551 start += strlen(query) + 1
553 while start[0] == ' ': start += 1
558 while end[0] != '\0' and end[0] != '"': end += 1
560 result = _force_str( PyBytes_FromStringAndSize( start, l ) )
563 return _force_str( start )
565 def setAttribute( self, name, value ):
566 '''convenience method to set an attribute.'''
571 cdef class NamedTupleProxy( TupleProxy ):
575 def __setattr__(self, key, value ):
578 idx, f = self.map_key2field[key]
579 if self.nfields < idx:
580 raise KeyError( "field %s not set" % key )
581 TupleProxy.__setitem__(self, idx, str(value) )
583 def __getattr__(self, key ):
585 idx, f = self.map_key2field[key]
586 if self.nfields < idx:
587 raise KeyError( "field %s not set" % key )
588 return f( self.fields[idx] )
590 cdef class BedProxy( NamedTupleProxy ):
591 '''Proxy class for access to Bed fields.
593 This class represents a Bed entry for fast read-access.
596 'contig' : (0, bytes),
600 'score' : (4, float),
601 'strand' : (5, bytes),
602 'thickStart' : (6, int ),
603 'thickEnd' : (7, int),
604 'itemRGB' : (8, bytes),
605 'blockCount': (9, int),
606 'blockSizes': (10, bytes),
607 'blockStarts': (11, bytes), }
609 cdef int getMaxFields( self, size_t nbytes ):
610 '''return max number of fields.'''
613 cdef update( self, char * buffer, size_t nbytes ):
614 '''update internal data.
616 nbytes does not include the terminal '\0'.
618 TupleProxy.update( self, buffer, nbytes )
621 raise ValueError( "bed format requires at least three columns" )
623 # determines bed format
624 self.bedfields = self.nfields
626 # do automatic conversion
627 self.contig = self.fields[0]
628 self.start = atoi( self.fields[1] )
629 self.end = atoi( self.fields[2] )
631 # __setattr__ in base class seems to take precedence
632 # hence implement setters in __setattr__
634 # def __get__( self ): return self.start
636 # def __get__( self ): return self.end
640 cdef int save_fields = self.nfields
641 # ensure fields to use correct format
642 self.nfields = self.bedfields
643 retval = TupleProxy.__str__( self )
644 self.nfields = save_fields
647 def __setattr__(self, key, value ):
649 if key == "start": self.start = value
650 elif key == "end": self.end = value
653 idx, f = self.map_key2field[key]
654 TupleProxy._setindex(self, idx, str(value) )
656 cdef class VCFProxy( NamedTupleProxy ):
657 '''Proxy class for access to VCF fields.
659 The genotypes are accessed via index.
662 'contig' : (0, bytes),
668 'filter' : (6, bytes),
670 'format' : (8, bytes) }
672 def __cinit__(self ):
673 # automatically calls TupleProxy.__cinit__
674 # start indexed access at genotypes
677 cdef update( self, char * buffer, size_t nbytes ):
678 '''update internal data.
680 nbytes does not include the terminal '\0'.
682 TupleProxy.update( self, buffer, nbytes )
684 self.contig = self.fields[0]
685 # vcf counts from 1 - correct here
686 self.pos = atoi( self.fields[1] ) - 1
689 '''return number of genotype fields.'''
690 return max(0, self.nfields - 9)
693 '''feature end (in 0-based open/closed coordinates).'''
697 def __setattr__(self, key, value ):
704 idx, f = self.map_key2field[key]
705 TupleProxy._setindex(self, idx, str(value) )