2 from cpython cimport PyString_FromStringAndSize, PyString_AsString, PyString_AS_STRING
4 cdef char * nextItem( char * buffer ):
6 pos = strchr( buffer, '\t' )
7 if pos == NULL: raise ValueError( "malformatted entry at %s" % buffer )
12 cdef char *StrOrEmpty( char * buffer ):
13 if buffer == NULL: return ""
cdef int isNew( char * p, char * buffer, size_t nbytes ):
    '''return 1 if *p* was allocated separately from *buffer*.

    A field pointer counts as "new" when it is not NULL and does not
    point into the region buffer[0] .. buffer[nbytes]. Callers use the
    result to decide whether a field has to be released with free().
    '''
    if p == NULL:
        return 0
    # The upper bound is inclusive: an empty trailing field points at
    # the terminating NUL stored at buffer[nbytes]. That byte is still
    # part of the buffer and must never be handed to free().
    return not (buffer <= p <= buffer + nbytes)
20 cdef class TupleProxy:
21 '''Proxy class for access to parsed row as a tuple.
23 This class represents a table row for fast read-access.
25 Access to individual fields is via the [] operator.
27 Only read-only access is implemented.
37 # start counting at field offset
40 def __dealloc__(self):
43 for x from 0 <= x < self.nfields:
44 if isNew( self.fields[x], self.data, self.nbytes ):
45 free( self.fields[x] )
48 if self.data != NULL: free(self.data)
49 if self.fields != NULL: free( self.fields )
51 cdef take( self, char * buffer, size_t nbytes ):
52 '''start presenting buffer.
54 Take ownership of the pointer.
58 self.update( buffer, nbytes )
cdef present( self, char * buffer, size_t nbytes ):
    '''expose *buffer* through this proxy without owning it.

    The pointer is only handed on to update(); it is not stored as
    the proxy's own data by this method.
    '''
    self.update( buffer, nbytes )
67 cdef copy( self, char * buffer, size_t nbytes ):
68 '''start presenting buffer.
70 Take a copy of buffer.
74 s = sizeof(char) * (nbytes + 1)
75 self.data = <char*>malloc( s )
77 raise ValueError("out of memory" )
79 memcpy( <char*>self.data, buffer, s )
80 self.update( self.data, nbytes )
82 cdef int getMaxFields( self, size_t nbytes ):
83 '''initialize fields.'''
cdef update( self, char * buffer, size_t nbytes ):
    '''update internal data.

    Split *buffer* in place into tab-separated fields: each tab is
    overwritten with a NUL byte and the start of every field is
    recorded in self.fields.

    The number of field slots is obtained from getMaxFields().
    *buffer* must carry a NUL byte at buffer[nbytes].

    Raises ValueError if the buffer is not terminated, if the field
    array cannot be allocated, or if the row holds more fields than
    there are slots.
    '''
    cdef char * pos
    cdef char * old_pos
    cdef int field
    cdef int max_fields

    if buffer[nbytes] != 0:
        raise ValueError( "incomplete line at %s" % buffer )

    #################################
    # clear data
    # Individually allocated fields have to be released while the
    # pointer array is still alive; freeing the array first would
    # mean reading the field pointers out of freed memory.
    if self.fields != NULL:
        for field from 0 <= field < self.nfields:
            if isNew( self.fields[field], self.data, self.nbytes ):
                free( self.fields[field] )
        free( self.fields )
        self.fields = NULL

    self.is_modified = self.nfields = 0

    #################################
    # allocate new
    max_fields = self.getMaxFields( nbytes )
    self.fields = <char **>calloc( max_fields, sizeof(char *) )
    if self.fields == NULL:
        raise ValueError("out of memory" )

    #################################
    # start filling
    field = 0
    self.fields[field] = pos = buffer
    field += 1
    old_pos = pos

    while 1:
        pos = <char*>memchr( pos, '\t', nbytes )
        if pos == NULL: break

        # Check before storing: a row with exactly max_fields fields
        # is valid, only a further field overflows the array.
        if field >= max_fields:
            raise ValueError("row too large - more than %i fields" % max_fields )

        # terminate the previous field and step onto the next one
        pos[0] = '\0'
        pos += 1
        self.fields[field] = pos
        field += 1

        # shrink the search window by the bytes just consumed
        nbytes -= pos - old_pos
        old_pos = pos

    self.nfields = field
144 def __getitem__( self, key ):
147 if i < 0: i += self.nfields
148 if i < 0: raise IndexError( "list index out of range" )
150 if i >= self.nfields:
151 raise IndexError( "list index out of range" )
152 return self.fields[i]
154 def _setindex( self, index, value ):
155 '''set item at idx index.'''
157 if idx < 0: raise IndexError( "list index out of range" )
158 if idx >= self.nfields:
159 raise IndexError( "list index out of range" )
161 if isNew( self.fields[idx], self.data, self.nbytes ):
162 free( self.fields[idx] )
167 self.fields[idx] = NULL
170 # conversion with error checking
171 cdef char * tmp = PyString_AsString( value )
172 self.fields[idx] = <char*>malloc( (strlen( tmp ) + 1) * sizeof(char) )
173 if self.fields[idx] == NULL:
174 raise ValueError("out of memory" )
175 strcpy( self.fields[idx], tmp )
177 def __setitem__(self, index, value ):
178 '''set item at *index* to *value*'''
180 if i < 0: i += self.nfields
183 self._setindex( i, value )
193 """python version of next().
195 if self.index >= self.nfields:
197 cdef char * retval = self.fields[self.index]
199 if retval == NULL: return None
203 '''return original data'''
204 # copy and replace \0 bytes with \t characters
206 # todo: treat NULL values
207 return "\t".join( [StrOrEmpty( self.fields[x]) for x in xrange(0, self.nfields ) ] )
209 cpy = <char*>calloc( sizeof(char), self.nbytes+1 )
211 raise ValueError("out of memory" )
212 memcpy( cpy, self.data, self.nbytes+1)
213 for x from 0 <= x < self.nbytes:
214 if cpy[x] == '\0': cpy[x] = '\t'
215 result = PyString_FromStringAndSize(cpy, self.nbytes)
220 '''convert value to '.' if None'''
221 if v == None: return "."
225 '''return a quoted attribute.'''
226 if type(v) in types.StringTypes:
231 cdef class GTFProxy( TupleProxy ):
232 '''Proxy class for access to GTF fields.
234 This class represents a GTF entry for fast read-access.
235 Write-access has been added as well, though some care must
236 be taken. If any of the string fields (contig, source, ...)
237 are set, the new value is tied to the lifetime of the
238 argument that was supplied.
240 The only exception is the attributes field when set from
241 a dictionary - this field will manage its own memory.
def __cinit__(self ):
    '''mark the attributes string as not owned by this object.'''
    # TupleProxy.__cinit__ is called automatically
    self.hasOwnAttributes = False
def __dealloc__(self):
    '''release the attributes string if this object allocated it.'''
    # TupleProxy.__dealloc__ is called automatically
    if not self.hasOwnAttributes:
        return
    free(self.attributes)
254 cdef int getMaxFields( self, size_t nbytes ):
255 '''return max number of fields.'''
258 cdef update( self, char * buffer, size_t nbytes ):
259 '''update internal data.
261 nbytes does not include the terminal '\0'.
264 cdef char * cstart, * cend, * cscore
268 if buffer[nbytes] != 0:
269 raise ValueError( "incomplete line at %s" % buffer )
271 self.source = pos = nextItem( buffer )
272 self.feature = pos = nextItem( pos )
273 cstart = pos = nextItem( pos )
274 cend = pos = nextItem( pos )
275 self.score = pos = nextItem( pos )
276 self.strand = pos = nextItem( pos )
277 self.frame = pos = nextItem( pos )
278 self.attributes = pos = nextItem( pos )
280 self.start = atoi( cstart ) - 1
281 self.end = atoi( cend )
284 '''contig of feature.'''
285 def __get__( self ): return self.contig
286 def __set__( self, value ):
287 self.is_modified = True
292 def __get__( self ): return self.feature
293 def __set__( self, value ):
294 self.is_modified = True
298 '''feature source.'''
299 def __get__( self ): return self.source
300 def __set__( self, value ):
301 self.is_modified = True
305 '''feature start (in 0-based open/closed coordinates).'''
306 def __get__( self ): return self.start
307 def __set__( self, value ):
308 self.is_modified = True
312 '''feature end (in 0-based open/closed coordinates).'''
313 def __get__( self ): return self.end
314 def __set__( self, value ):
315 self.is_modified = True
321 if self.score[0] == '.' and self.score[1] == '\0' :
324 return atof(self.score)
325 def __set__( self, value ):
326 self.is_modified = True
330 '''feature strand.'''
331 def __get__( self ): return self.strand
332 def __set__( self, value ):
333 self.is_modified = True
338 def __get__( self ): return self.frame
339 def __set__( self, value ):
340 self.is_modified = True
344 '''feature attributes (as a string).'''
345 def __get__( self ): return self.attributes
346 def __set__( self, value ):
347 self.is_modified = True
348 self.attributes = value
351 """parse attributes - return as dict
355 attributes = self.attributes
357 # separate into fields
358 fields = [ x.strip() for x in attributes.split(";")[:-1]]
364 d = [ x.strip() for x in f.split(" ")]
367 if len(d) > 2: v = d[1:]
369 if v[0] == '"' and v[-1] == '"':
372 ## try to convert to a value
385 def fromDict( self, d ):
386 '''set attributes from a dictionary.'''
390 # clean up if this field is set twice
391 if self.hasOwnAttributes:
392 free(self.attributes)
395 for k,v in d.items():
396 if type(v) == types.StringType:
397 aa.append( '%s "%s"' % (k,v) )
399 aa.append( '%s %s' % (k,str(v)) )
401 a = "; ".join( aa ) + ";"
404 self.attributes = <char *>calloc( l + 1, sizeof(char) )
405 if self.attributes == NULL:
406 raise ValueError("out of memory" )
407 memcpy( self.attributes, p, l )
409 self.hasOwnAttributes = True
410 self.is_modified = True
428 return TupleProxy.__str__(self)
def invert( self, int lcontig ):
    '''mirror start and end within a contig of length *lcontig*.

    Nothing is changed unless the strand field starts with '-'.
    '''
    if self.strand[0] != '-':
        return

    lower = min(self.start, self.end)
    upper = max(self.start, self.end)
    self.start = lcontig - upper
    self.end = lcontig - lower
442 '''return a list of attributes defined in this entry.'''
444 return [ x.strip().split(" ")[0] for x in r.split(";") if x.strip() != '' ]
def __getitem__(self, item):
    '''look up *item* exactly as attribute access would.'''
    value = self.__getattr__( item )
    return value
449 def __getattr__(self, item ):
450 """Generic lookup of attribute from GFF/GTF attributes
451 Only called if there *isn't* an attribute with this name
460 start = strstr( self.attributes, query)
462 raise AttributeError("'GTFProxy' has no attribute '%s'" % item )
464 start += strlen(query) + 1
466 while start[0] == " ": start += 1
470 while end[0] != '\0' and end[0] != '"': end += 1
472 cpy = <char*>calloc( l, sizeof(char ) )
473 if cpy == NULL: raise ValueError("out of memory" )
474 memcpy( cpy, start, l )
482 def setAttribute( self, name, value ):
483 '''convenience method to set an attribute.'''
cdef class NamedTupleProxy( TupleProxy ):
    '''TupleProxy whose fields can also be accessed by name.

    Names are resolved through map_key2field, which maps a name to a
    (field index, converter) pair.
    '''

    # NOTE(review): default mapping with no named fields; subclasses
    # are expected to supply their own - confirm against the full file.
    map_key2field = {}

    def __setattr__(self, key, value ):
        '''set the field called *key* to str(value).

        Raises KeyError if *key* is not in map_key2field or if its
        field is not present in the current row.
        '''
        cdef int idx
        idx, f = self.map_key2field[key]
        # valid field indices are 0 .. nfields - 1
        if idx >= self.nfields:
            raise KeyError( "field %s not set" % key )
        TupleProxy.__setitem__(self, idx, str(value) )

    def __getattr__(self, key ):
        '''return the field called *key*, passed through its converter.

        Raises KeyError if *key* is not in map_key2field or if its
        field is not present in the current row.
        '''
        cdef int idx
        idx, f = self.map_key2field[key]
        # valid field indices are 0 .. nfields - 1
        if idx >= self.nfields:
            raise KeyError( "field %s not set" % key )
        return f( self.fields[idx] )
507 cdef class BedProxy( NamedTupleProxy ):
508 '''Proxy class for access to Bed fields.
510 This class represents a GTF entry for fast read-access.
517 'score' : (4, float),
519 'thickStart' : (6,int ),
520 'thickEnd' : (7,int),
522 'blockCount': (9,int),
523 'blockSizes': (10,str),
524 'blockStarts': (11,str), }
526 cdef int getMaxFields( self, size_t nbytes ):
527 '''return max number of fields.'''
530 cdef update( self, char * buffer, size_t nbytes ):
531 '''update internal data.
533 nbytes does not include the terminal '\0'.
535 TupleProxy.update( self, buffer, nbytes )
538 raise ValueError( "bed format requires at least three columns" )
540 # determines bed format
541 self.bedfields = self.nfields
543 # do automatic conversion
544 self.contig = self.fields[0]
545 self.start = atoi( self.fields[1] )
546 self.end = atoi( self.fields[2] )
548 # __setattr__ in base class seems to take precedence
549 # hence implement setters in __setattr__
551 # def __get__( self ): return self.start
553 # def __get__( self ): return self.end
557 cdef int save_fields = self.nfields
558 # ensure fields to use correct format
559 self.nfields = self.bedfields
560 retval = TupleProxy.__str__( self )
561 self.nfields = save_fields
564 def __setattr__(self, key, value ):
566 if key == "start": self.start = value
567 elif key == "end": self.end = value
570 idx, f = self.map_key2field[key]
571 TupleProxy._setindex(self, idx, str(value) )
573 cdef class VCFProxy( NamedTupleProxy ):
574 '''Proxy class for access to VCF fields.
576 The genotypes are accessed via index.
589 def __cinit__(self ):
590 # automatically calls TupleProxy.__cinit__
591 # start indexed access at genotypes
594 cdef update( self, char * buffer, size_t nbytes ):
595 '''update internal data.
597 nbytes does not include the terminal '\0'.
599 TupleProxy.update( self, buffer, nbytes )
601 self.contig = self.fields[0]
602 # vcf counts from 1 - correct here
603 self.pos = atoi( self.fields[1] ) - 1
606 return max(0, self.nfields - 9)
608 def __setattr__(self, key, value ):
615 idx, f = self.map_key2field[key]
616 TupleProxy._setindex(self, idx, str(value) )