2 from cpython cimport PyString_FromStringAndSize, PyString_AsString, PyString_AS_STRING
4 cdef char * nextItem( char * buffer ):
6 pos = strchr( buffer, '\t' )
7 if pos == NULL: raise ValueError( "malformatted entry at %s" % buffer )
cdef char *StrOrEmpty(char * buffer):
    '''return *buffer*, substituting an empty string for NULL.'''
    # guard first: anything that is a real pointer is passed through
    if buffer != NULL:
        return buffer
    return ""
cdef int isNew(char * p, char * buffer, size_t nbytes):
    '''return 1 if *p* points outside of *buffer*, 0 otherwise.

    Callers use the result to decide whether a field pointer was
    allocated separately and therefore has to be released with
    free(); a pointer into the row buffer must never be freed on
    its own.

    *buffer* holds *nbytes* characters followed by a terminating
    NUL byte, so the address buffer + nbytes still belongs to the
    buffer.

    A NULL pointer is never reported as new.
    '''
    if p == NULL:
        return 0
    # The upper bound is inclusive: a pointer at the terminator
    # (presumably an empty last field of a row ending in a tab)
    # is part of the buffer and must not be handed to free().
    if buffer <= p <= buffer + nbytes:
        return 0
    return 1
20 cdef class TupleProxy:
21 '''Proxy class for access to parsed row as a tuple.
23 This class represents a table row for fast read-access.
25 Access to individual fields is via the [] operator.
27 Only read-only access is implemented.
37 # start counting at field offset
40 def __dealloc__(self):
43 for x from 0 <= x < self.nfields:
44 if isNew( self.fields[x], self.data, self.nbytes ):
45 free( self.fields[x] )
48 if self.data != NULL: free(self.data)
49 if self.fields != NULL: free( self.fields )
51 cdef take( self, char * buffer, size_t nbytes ):
52 '''start presenting buffer.
54 Take ownership of the pointer.
58 self.update( buffer, nbytes )
cdef present(self, char * buffer, size_t nbytes):
    '''present *buffer* without taking ownership of it.

    The pointer is only handed to update(); the caller stays
    responsible for the memory behind it.
    '''
    self.update(buffer, nbytes)
67 cdef copy( self, char * buffer, size_t nbytes ):
68 '''start presenting buffer.
70 Take a copy of buffer.
74 s = sizeof(char) * (nbytes + 1)
75 self.data = <char*>malloc( s )
77 raise ValueError("out of memory" )
79 memcpy( <char*>self.data, buffer, s )
80 self.update( self.data, nbytes )
82 cdef int getMaxFields( self, size_t nbytes ):
83 '''initialize fields.'''
86 cdef update( self, char * buffer, size_t nbytes ):
87 '''update internal data.
89 Update starts work in buffer, thus can be used
90 to collect any number of fields until nbytes
93 If max_fields is set, the number of fields is initialized to max_fields.
99 cdef int max_fields, x
101 if buffer[nbytes] != 0:
102 raise ValueError( "incomplete line at %s" % buffer )
104 #################################
106 if self.fields != NULL: free(self.fields)
108 for field from 0 <= field < self.nfields:
109 if isNew( self.fields[field], self.data, self.nbytes ):
110 free( self.fields[field] )
112 self.is_modified = self.nfields = 0
114 #################################
116 max_fields = self.getMaxFields( nbytes )
117 self.fields = <char **>calloc( max_fields, sizeof(char *) )
118 if self.fields == NULL:
119 raise ValueError("out of memory" )
121 #################################
124 self.fields[field] = pos = buffer
130 pos = <char*>memchr( pos, '\t', nbytes )
131 if pos == NULL: break
134 self.fields[field] = pos
136 if field >= max_fields:
137 raise ValueError("row too large - more than %i fields" % max_fields )
138 nbytes -= pos - old_pos
144 def _getindex( self, int index ):
145 '''return item at idx index'''
147 if i < 0: i += self.nfields
148 if i < 0: raise IndexError( "list index out of range" )
150 if i >= self.nfields:
151 raise IndexError( "list index out of range %i >= %i" % (i, self.nfields ))
152 return self.fields[i]
def __getitem__(self, key):
    '''return the field at index *key* or a list of fields for a slice.

    A slice is resolved against the current number of fields and
    each selected field is fetched through _getindex(). Any other
    key is passed to _getindex() as an index, so integer types
    other than plain int are accepted as well.

    Raises IndexError (from _getindex) if an index is out of range.
    '''
    if not isinstance(key, slice):
        # an exact type(key) == int test would send long integers
        # and int subclasses down the slice path, where they fail
        # for lack of an indices() method
        return self._getindex(key)

    start, end, step = key.indices(self.nfields)
    return [self._getindex(index) for index in range(start, end, step)]
163 def _setindex( self, index, value ):
164 '''set item at idx index.'''
166 if idx < 0: raise IndexError( "list index out of range" )
167 if idx >= self.nfields:
168 raise IndexError( "list index out of range" )
170 if isNew( self.fields[idx], self.data, self.nbytes ):
171 free( self.fields[idx] )
176 self.fields[idx] = NULL
179 # conversion with error checking
180 cdef char * tmp = PyString_AsString( value )
181 self.fields[idx] = <char*>malloc( (strlen( tmp ) + 1) * sizeof(char) )
182 if self.fields[idx] == NULL:
183 raise ValueError("out of memory" )
184 strcpy( self.fields[idx], tmp )
186 def __setitem__(self, index, value ):
187 '''set item at *index* to *value*'''
189 if i < 0: i += self.nfields
192 self._setindex( i, value )
202 """python version of next().
204 if self.index >= self.nfields:
206 cdef char * retval = self.fields[self.index]
208 if retval == NULL: return None
212 '''return original data'''
213 # copy and replace \0 bytes with \t characters
215 # todo: treat NULL values
216 return "\t".join( [StrOrEmpty( self.fields[x]) for x in xrange(0, self.nfields ) ] )
218 cpy = <char*>calloc( sizeof(char), self.nbytes+1 )
220 raise ValueError("out of memory" )
221 memcpy( cpy, self.data, self.nbytes+1)
222 for x from 0 <= x < self.nbytes:
223 if cpy[x] == '\0': cpy[x] = '\t'
224 result = PyString_FromStringAndSize(cpy, self.nbytes)
229 '''convert value to '.' if None'''
230 if v == None: return "."
234 '''return a quoted attribute.'''
235 if type(v) in types.StringTypes:
240 cdef class GTFProxy( TupleProxy ):
241 '''Proxy class for access to GTF fields.
243 This class represents a GTF entry for fast read-access.
244 Write-access has been added as well, though some care must
245 be taken. If any of the string fields (contig, source, ...)
246 are set, the new value is tied to the lifetime of the
247 argument that was supplied.
249 The only exception is the attributes field when set from
250 a dictionary - this field will manage its own memory.
def __cinit__(self):
    '''start out without a privately allocated attributes string.'''
    # TupleProxy.__cinit__ has been called automatically before
    # this point.
    self.hasOwnAttributes = False
def __dealloc__(self):
    '''release the attributes string if this object allocated it.'''
    # TupleProxy.__dealloc__ is called automatically as well.
    if not self.hasOwnAttributes:
        return
    free(self.attributes)
263 cdef int getMaxFields( self, size_t nbytes ):
264 '''return max number of fields.'''
267 cdef update( self, char * buffer, size_t nbytes ):
268 '''update internal data.
270 nbytes does not include the terminal '\0'.
273 cdef char * cstart, * cend, * cscore
277 if buffer[nbytes] != 0:
278 raise ValueError( "incomplete line at %s" % buffer )
280 self.source = pos = nextItem( buffer )
281 self.feature = pos = nextItem( pos )
282 cstart = pos = nextItem( pos )
283 cend = pos = nextItem( pos )
284 self.score = pos = nextItem( pos )
285 self.strand = pos = nextItem( pos )
286 self.frame = pos = nextItem( pos )
287 self.attributes = pos = nextItem( pos )
289 self.start = atoi( cstart ) - 1
290 self.end = atoi( cend )
294 '''contig of feature.'''
295 def __get__( self ): return self.contig
296 def __set__( self, value ):
297 self.is_modified = True
302 def __get__( self ): return self.feature
303 def __set__( self, value ):
304 self.is_modified = True
308 '''feature source.'''
309 def __get__( self ): return self.source
310 def __set__( self, value ):
311 self.is_modified = True
315 '''feature start (in 0-based open/closed coordinates).'''
316 def __get__( self ): return self.start
317 def __set__( self, value ):
318 self.is_modified = True
322 '''feature end (in 0-based open/closed coordinates).'''
323 def __get__( self ): return self.end
324 def __set__( self, value ):
325 self.is_modified = True
331 if self.score[0] == '.' and self.score[1] == '\0' :
334 return atof(self.score)
335 def __set__( self, value ):
336 self.is_modified = True
340 '''feature strand.'''
341 def __get__( self ): return self.strand
342 def __set__( self, value ):
343 self.is_modified = True
348 def __get__( self ): return self.frame
349 def __set__( self, value ):
350 self.is_modified = True
354 '''feature attributes (as a string).'''
355 def __get__( self ): return self.attributes
356 def __set__( self, value ):
357 self.is_modified = True
358 self.attributes = value
361 """parse attributes - return as dict
365 attributes = self.attributes
367 # separate into fields
368 fields = [ x.strip() for x in attributes.split(";")[:-1]]
374 d = [ x.strip() for x in f.split(" ")]
377 if len(d) > 2: v = d[1:]
379 if v[0] == '"' and v[-1] == '"':
382 ## try to convert to a value
395 def fromDict( self, d ):
396 '''set attributes from a dictionary.'''
400 # clean up if this field is set twice
401 if self.hasOwnAttributes:
402 free(self.attributes)
405 for k,v in d.items():
406 if type(v) == types.StringType:
407 aa.append( '%s "%s"' % (k,v) )
409 aa.append( '%s %s' % (k,str(v)) )
411 a = "; ".join( aa ) + ";"
414 self.attributes = <char *>calloc( l + 1, sizeof(char) )
415 if self.attributes == NULL:
416 raise ValueError("out of memory" )
417 memcpy( self.attributes, p, l )
419 self.hasOwnAttributes = True
420 self.is_modified = True
438 return TupleProxy.__str__(self)
def invert(self, int lcontig):
    '''invert coordinates to negative strand coordinates.

    *lcontig* is the value the coordinates are subtracted from.
    Nothing happens unless the strand of the feature starts
    with '-'.
    '''
    if self.strand[0] != '-':
        return

    # order the two coordinates first so that the mirrored
    # interval keeps start <= end
    lower = min(self.start, self.end)
    upper = max(self.start, self.end)
    self.start, self.end = lcontig - upper, lcontig - lower
452 '''return a list of attributes defined in this entry.'''
454 return [ x.strip().split(" ")[0] for x in r.split(";") if x.strip() != '' ]
def __getitem__(self, key):
    '''return the value for *key* by delegating to __getattr__.'''
    value = self.__getattr__(key)
    return value
459 def __getattr__(self, item ):
460 """Generic lookup of attribute from GFF/GTF attributes
461 Only called if there *isn't* an attribute with this name
470 start = strstr( self.attributes, query)
472 raise AttributeError("'GTFProxy' has no attribute '%s'" % item )
474 start += strlen(query) + 1
476 while start[0] == ' ': start += 1
480 while end[0] != '\0' and end[0] != '"': end += 1
482 result = PyString_FromStringAndSize( start, l )
487 def setAttribute( self, name, value ):
488 '''convenience method to set an attribute.'''
493 cdef class NamedTupleProxy( TupleProxy ):
def __setattr__(self, key, value):
    '''set the field named *key* to *value*.

    *key* is looked up in map_key2field to obtain the column
    index; *value* is converted with str() before it is stored
    through TupleProxy.__setitem__.

    Raises KeyError if *key* is missing from map_key2field or if
    the current row has no column at that index.
    '''
    cdef int idx
    idx, f = self.map_key2field[key]
    # valid columns are 0 .. nfields - 1, so idx == nfields is
    # already beyond the last field of the row
    if idx >= self.nfields:
        raise KeyError("field %s not set" % key)
    TupleProxy.__setitem__(self, idx, str(value))
def __getattr__(self, key):
    '''return the field named *key*, converted to its declared type.

    *key* is looked up in map_key2field, which yields the column
    index and a conversion function; the function is applied to
    the raw field before it is returned.

    Raises KeyError if *key* is missing from map_key2field or if
    the current row has no column at that index.
    '''
    cdef int idx
    idx, f = self.map_key2field[key]
    # valid columns are 0 .. nfields - 1; without rejecting
    # idx == nfields the lookup below would read past the last
    # parsed field
    if idx >= self.nfields:
        raise KeyError("field %s not set" % key)
    return f(self.fields[idx])
512 cdef class BedProxy( NamedTupleProxy ):
513 '''Proxy class for access to Bed fields.
515 This class represents a Bed entry for fast read-access.
522 'score' : (4, float),
524 'thickStart' : (6,int ),
525 'thickEnd' : (7,int),
527 'blockCount': (9,int),
528 'blockSizes': (10,str),
529 'blockStarts': (11,str), }
531 cdef int getMaxFields( self, size_t nbytes ):
532 '''return max number of fields.'''
535 cdef update( self, char * buffer, size_t nbytes ):
536 '''update internal data.
538 nbytes does not include the terminal '\0'.
540 TupleProxy.update( self, buffer, nbytes )
543 raise ValueError( "bed format requires at least three columns" )
545 # determines bed format
546 self.bedfields = self.nfields
548 # do automatic conversion
549 self.contig = self.fields[0]
550 self.start = atoi( self.fields[1] )
551 self.end = atoi( self.fields[2] )
553 # __setattr__ in base class seems to take precedence
554 # hence implement setters in __setattr__
556 # def __get__( self ): return self.start
558 # def __get__( self ): return self.end
562 cdef int save_fields = self.nfields
563 # ensure fields to use correct format
564 self.nfields = self.bedfields
565 retval = TupleProxy.__str__( self )
566 self.nfields = save_fields
569 def __setattr__(self, key, value ):
571 if key == "start": self.start = value
572 elif key == "end": self.end = value
575 idx, f = self.map_key2field[key]
576 TupleProxy._setindex(self, idx, str(value) )
578 cdef class VCFProxy( NamedTupleProxy ):
579 '''Proxy class for access to VCF fields.
581 The genotypes are accessed via index.
594 def __cinit__(self ):
595 # automatically calls TupleProxy.__cinit__
596 # start indexed access at genotypes
599 cdef update( self, char * buffer, size_t nbytes ):
600 '''update internal data.
602 nbytes does not include the terminal '\0'.
604 TupleProxy.update( self, buffer, nbytes )
606 self.contig = self.fields[0]
607 # vcf counts from 1 - correct here
608 self.pos = atoi( self.fields[1] ) - 1
611 return max(0, self.nfields - 9)
614 '''feature end (in 0-based open/closed coordinates).'''
618 def __setattr__(self, key, value ):
625 idx, f = self.map_key2field[key]
626 TupleProxy._setindex(self, idx, str(value) )