X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=pysam.git;a=blobdiff_plain;f=pysam%2FTabProxies.pyx;fp=pysam%2FTabProxies.pyx;h=36295ebe12ccba9584b2b30be907fe848601f537;hp=0000000000000000000000000000000000000000;hb=d02fe5283ed7a93a2f76a5d6dc6e37b40c11b9b1;hpb=d828f9c9aa78e3d1687265b52de841f3f3852089 diff --git a/pysam/TabProxies.pyx b/pysam/TabProxies.pyx new file mode 100644 index 0000000..36295eb --- /dev/null +++ b/pysam/TabProxies.pyx @@ -0,0 +1,617 @@ +import types +from cpython cimport PyString_FromStringAndSize, PyString_AsString, PyString_AS_STRING + +cdef char * nextItem( char * buffer ): + cdef char * pos + pos = strchr( buffer, '\t' ) + if pos == NULL: raise ValueError( "malformatted entry at %s" % buffer ) + pos[0] = '\0' + pos += 1 + return pos + +cdef char *StrOrEmpty( char * buffer ): + if buffer == NULL: return "" + else: return buffer + +cdef int isNew( char * p, char * buffer, size_t nbytes ): + if p == NULL: return 0 + return not (buffer <= p < buffer + nbytes ) + +cdef class TupleProxy: + '''Proxy class for access to parsed row as a tuple. + + This class represents a table row for fast read-access. + + Access to individual fields is via the [] operator. + + Only read-only access is implemented. + ''' + + def __cinit__(self ): + self.data = NULL + self.fields = NULL + self.index = 0 + self.nbytes = 0 + self.is_modified = 0 + self.nfields = 0 + # start counting at field offset + self.offset = 0 + + def __dealloc__(self): + cdef int x + if self.is_modified: + for x from 0 <= x < self.nfields: + if isNew( self.fields[x], self.data, self.nbytes ): + free( self.fields[x] ) + self.fields[x] = NULL + + if self.data != NULL: free(self.data) + if self.fields != NULL: free( self.fields ) + + cdef take( self, char * buffer, size_t nbytes ): + '''start presenting buffer. + + Take ownership of the pointer. + ''' + self.data = buffer + self.nbytes = nbytes + self.update( buffer, nbytes ) + + cdef present( self, char * buffer, size_t nbytes ): + '''start presenting buffer. + + Do not take ownership of the pointer. + ''' + self.update( buffer, nbytes ) + + cdef copy( self, char * buffer, size_t nbytes ): + '''start presenting buffer. + + Take a copy of buffer. + ''' + cdef int s + # +1 for '\0' + s = sizeof(char) * (nbytes + 1) + self.data = malloc( s ) + if self.data == NULL: + raise ValueError("out of memory" ) + self.nbytes = nbytes + memcpy( self.data, buffer, s ) + self.update( self.data, nbytes ) + + cdef int getMaxFields( self, size_t nbytes ): + '''initialize fields.''' + return nbytes / 2 + + cdef update( self, char * buffer, size_t nbytes ): + '''update internal data. + + Update starts work in buffer, thus can be used + to collect any number of fields until nbytes + is exhausted. + + If max_fields is set, the number of fields is initialized to max_fields. + + ''' + cdef char * pos + cdef char * old_pos + cdef int field + cdef int max_fields, x + + if buffer[nbytes] != 0: + raise ValueError( "incomplete line at %s" % buffer ) + + ################################# + # clear data + if self.fields != NULL: free(self.fields) + + for field from 0 <= field < self.nfields: + if isNew( self.fields[field], self.data, self.nbytes ): + free( self.fields[field] ) + + self.is_modified = self.nfields = 0 + + ################################# + # allocate new + max_fields = self.getMaxFields( nbytes ) + self.fields = calloc( max_fields, sizeof(char *) ) + if self.fields == NULL: + raise ValueError("out of memory" ) + + ################################# + # start filling + field = 0 + self.fields[field] = pos = buffer + field += 1 + old_pos = pos + + while 1: + + pos = memchr( pos, '\t', nbytes ) + if pos == NULL: break + pos[0] = '\0' + pos += 1 + self.fields[field] = pos + field += 1 + if field >= max_fields: + raise ValueError("row too large - more than %i fields" % max_fields ) + nbytes -= pos - old_pos + if nbytes < 0: break + old_pos = pos + + self.nfields = field + + def __getitem__( self, key ): + + cdef int i = key + if i < 0: i += self.nfields + if i < 0: raise IndexError( "list index out of range" ) + i += self.offset + if i >= self.nfields: + raise IndexError( "list index out of range" ) + return self.fields[i] + + def _setindex( self, index, value ): + '''set item at idx index.''' + cdef int idx = index + if idx < 0: raise IndexError( "list index out of range" ) + if idx >= self.nfields: + raise IndexError( "list index out of range" ) + + if isNew( self.fields[idx], self.data, self.nbytes ): + free( self.fields[idx] ) + + self.is_modified = 1 + + if value == None: + self.fields[idx] = NULL + return + + # conversion with error checking + cdef char * tmp = PyString_AsString( value ) + self.fields[idx] = malloc( (strlen( tmp ) + 1) * sizeof(char) ) + if self.fields[idx] == NULL: + raise ValueError("out of memory" ) + strcpy( self.fields[idx], tmp ) + + def __setitem__(self, index, value ): + '''set item at *index* to *value*''' + cdef int i = index + if i < 0: i += self.nfields + i += self.offset + + self._setindex( i, value ) + + def __len__(self): + return self.nfields + + def __iter__(self): + self.index = 0 + return self + + def __next__(self): + """python version of next(). + """ + if self.index >= self.nfields: + raise StopIteration + cdef char * retval = self.fields[self.index] + self.index += 1 + if retval == NULL: return None + else: return retval + + def __str__(self): + '''return original data''' + # copy and replace \0 bytes with \t characters + if self.is_modified: + # todo: treat NULL values + return "\t".join( [StrOrEmpty( self.fields[x]) for x in xrange(0, self.nfields ) ] ) + else: + cpy = calloc( sizeof(char), self.nbytes+1 ) + if cpy == NULL: + raise ValueError("out of memory" ) + memcpy( cpy, self.data, self.nbytes+1) + for x from 0 <= x < self.nbytes: + if cpy[x] == '\0': cpy[x] = '\t' + result = PyString_FromStringAndSize(cpy, self.nbytes) + free(cpy) + return result + +def toDot( v ): + '''convert value to '.' if None''' + if v == None: return "." + else: return str(v) + +def quote( v ): + '''return a quoted attribute.''' + if type(v) in types.StringTypes: + return '"%s"' % v + else: + return str(v) + +cdef class GTFProxy( TupleProxy ): + '''Proxy class for access to GTF fields. + + This class represents a GTF entry for fast read-access. + Write-access has been added as well, though some care must + be taken. If any of the string fields (contig, source, ...) + are set, the new value is tied to the lifetime of the + argument that was supplied. + + The only exception is the attributes field when set from + a dictionary - this field will manage its own memory. + + ''' + + def __cinit__(self ): + # automatically calls TupleProxy.__cinit__ + self.hasOwnAttributes = False + + def __dealloc__(self): + # automatically calls TupleProxy.__dealloc__ + if self.hasOwnAttributes: + free(self.attributes) + + cdef int getMaxFields( self, size_t nbytes ): + '''return max number of fields.''' + return 9 + + cdef update( self, char * buffer, size_t nbytes ): + '''update internal data. + + nbytes does not include the terminal '\0'. + ''' + cdef int end + cdef char * cstart, * cend, * cscore + self.contig = buffer + cdef char * pos + + if buffer[nbytes] != 0: + raise ValueError( "incomplete line at %s" % buffer ) + + self.source = pos = nextItem( buffer ) + self.feature = pos = nextItem( pos ) + cstart = pos = nextItem( pos ) + cend = pos = nextItem( pos ) + self.score = pos = nextItem( pos ) + self.strand = pos = nextItem( pos ) + self.frame = pos = nextItem( pos ) + self.attributes = pos = nextItem( pos ) + + self.start = atoi( cstart ) - 1 + self.end = atoi( cend ) + + property contig: + '''contig of feature.''' + def __get__( self ): return self.contig + def __set__( self, value ): + self.is_modified = True + self.contig = value + + property feature: + '''feature name.''' + def __get__( self ): return self.feature + def __set__( self, value ): + self.is_modified = True + self.feature = value + + property source: + '''feature source.''' + def __get__( self ): return self.source + def __set__( self, value ): + self.is_modified = True + self.source = value + + property start: + '''feature start (in 0-based open/closed coordinates).''' + def __get__( self ): return self.start + def __set__( self, value ): + self.is_modified = True + self.start = value + + property end: + '''feature end (in 0-based open/closed coordinates).''' + def __get__( self ): return self.end + def __set__( self, value ): + self.is_modified = True + self.end = value + + property score: + '''feature score.''' + def __get__( self ): + if self.score[0] == '.' and self.score[1] == '\0' : + return None + else: + return atof(self.score) + def __set__( self, value ): + self.is_modified = True + self.score = value + + property strand: + '''feature strand.''' + def __get__( self ): return self.strand + def __set__( self, value ): + self.is_modified = True + self.strand = value + + property frame: + '''feature frame.''' + def __get__( self ): return self.frame + def __set__( self, value ): + self.is_modified = True + self.frame = value + + property attributes: + '''feature attributes (as a string).''' + def __get__( self ): return self.attributes + def __set__( self, value ): + self.is_modified = True + self.attributes = value + + def asDict( self ): + """parse attributes - return as dict + """ + + # remove comments + attributes = self.attributes + + # separate into fields + fields = [ x.strip() for x in attributes.split(";")[:-1]] + + result = {} + + for f in fields: + + d = [ x.strip() for x in f.split(" ")] + + n,v = d[0], d[1] + if len(d) > 2: v = d[1:] + + if v[0] == '"' and v[-1] == '"': + v = v[1:-1] + else: + ## try to convert to a value + try: + v = float( v ) + v = int( v ) + except ValueError: + pass + except TypeError: + pass + + result[n] = v + + return result + + def fromDict( self, d ): + '''set attributes from a dictionary.''' + cdef char * p + cdef int l + + # clean up if this field is set twice + if self.hasOwnAttributes: + free(self.attributes) + + aa = [] + for k,v in d.items(): + if type(v) == types.StringType: + aa.append( '%s "%s"' % (k,v) ) + else: + aa.append( '%s %s' % (k,str(v)) ) + + a = "; ".join( aa ) + ";" + p = a + l = len(a) + self.attributes = calloc( l + 1, sizeof(char) ) + if self.attributes == NULL: + raise ValueError("out of memory" ) + memcpy( self.attributes, p, l ) + + self.hasOwnAttributes = True + self.is_modified = True + + def __str__(self): + cdef char * cpy + cdef int x + + if self.is_modified: + return "\t".join( + (self.contig, + self.source, + self.feature, + str(self.start+1), + str(self.end), + toDot(self.score), + self.strand, + self.frame, + self.attributes ) ) + else: + return TupleProxy.__str__(self) + + def invert( self, int lcontig ): + '''invert coordinates to negative strand coordinates + + This method will only act if the feature is on the + negative strand.''' + + if self.strand[0] == '-': + start = min(self.start, self.end) + end = max(self.start, self.end) + self.start, self.end = lcontig - end, lcontig - start + + def keys( self ): + '''return a list of attributes defined in this entry.''' + r = self.attributes + return [ x.strip().split(" ")[0] for x in r.split(";") if x.strip() != '' ] + + def __getitem__(self, item): + return self.__getattr__( item ) + + def __getattr__(self, item ): + """Generic lookup of attribute from GFF/GTF attributes + Only called if there *isn't* an attribute with this name + """ + cdef char * start + cdef char * query + cdef char * cpy + cdef char * end + cdef int l + query = item + + start = strstr( self.attributes, query) + if start == NULL: + raise AttributeError("'GTFProxy' has no attribute '%s'" % item ) + + start += strlen(query) + 1 + # skip gaps before + while start[0] == " ": start += 1 + if start[0] == '"': + start += 1 + end = start + while end[0] != '\0' and end[0] != '"': end += 1 + l = end - start + 1 + cpy = calloc( l, sizeof(char ) ) + if cpy == NULL: raise ValueError("out of memory" ) + memcpy( cpy, start, l ) + cpy[l-1] = '\0' + result = cpy + free(cpy) + return result + else: + return start + + def setAttribute( self, name, value ): + '''convenience method to set an attribute.''' + r = self.asDict() + r[name] = value + self.fromDict( r ) + +cdef class NamedTupleProxy( TupleProxy ): + + map_key2field = {} + + def __setattr__(self, key, value ): + '''set attribute.''' + cdef int idx + idx, f = self.map_key2field[key] + if self.nfields < idx: + raise KeyError( "field %s not set" % key ) + TupleProxy.__setitem__(self, idx, str(value) ) + + def __getattr__(self, key ): + cdef int idx + idx, f = self.map_key2field[key] + if self.nfields < idx: + raise KeyError( "field %s not set" % key ) + return f( self.fields[idx] ) + +cdef class BedProxy( NamedTupleProxy ): + '''Proxy class for access to Bed fields. + + This class represents a GTF entry for fast read-access. + ''' + map_key2field = { + 'contig' : (0, str), + 'start' : (1, int), + 'end' : (2, int), + 'name' : (3, str), + 'score' : (4, float), + 'strand' : (5, str), + 'thickStart' : (6,int ), + 'thickEnd' : (7,int), + 'itemRGB' : (8,str), + 'blockCount': (9,int), + 'blockSizes': (10,str), + 'blockStarts': (11,str), } + + cdef int getMaxFields( self, size_t nbytes ): + '''return max number of fields.''' + return 12 + + cdef update( self, char * buffer, size_t nbytes ): + '''update internal data. + + nbytes does not include the terminal '\0'. + ''' + TupleProxy.update( self, buffer, nbytes ) + + if self.nfields < 3: + raise ValueError( "bed format requires at least three columns" ) + + # determines bed format + self.bedfields = self.nfields + + # do automatic conversion + self.contig = self.fields[0] + self.start = atoi( self.fields[1] ) + self.end = atoi( self.fields[2] ) + + # __setattr__ in base class seems to take precedence + # hence implement setters in __setattr__ + #property start: + # def __get__( self ): return self.start + #property end: + # def __get__( self ): return self.end + + def __str__(self): + + cdef int save_fields = self.nfields + # ensure fields to use correct format + self.nfields = self.bedfields + retval = TupleProxy.__str__( self ) + self.nfields = save_fields + return retval + + def __setattr__(self, key, value ): + '''set attribute.''' + if key == "start": self.start = value + elif key == "end": self.end = value + + cdef int idx + idx, f = self.map_key2field[key] + TupleProxy._setindex(self, idx, str(value) ) + +cdef class VCFProxy( NamedTupleProxy ): + '''Proxy class for access to VCF fields. + + The genotypes are accessed via index. + ''' + map_key2field = { + 'contig' : (0, str), + 'pos' : (1, int), + 'id' : (2, str), + 'ref' : (3, str), + 'alt' : (4, str), + 'qual' : (5, str), + 'filter' : (6,str), + 'info' : (7,str), + 'format' : (8,str) } + + def __cinit__(self ): + # automatically calls TupleProxy.__cinit__ + # start indexed access at genotypes + self.offset = 9 + + cdef update( self, char * buffer, size_t nbytes ): + '''update internal data. + + nbytes does not include the terminal '\0'. + ''' + TupleProxy.update( self, buffer, nbytes ) + + self.contig = self.fields[0] + # vcf counts from 1 - correct here + self.pos = atoi( self.fields[1] ) - 1 + + def __len__(self): + return max(0, self.nfields - 9) + + def __setattr__(self, key, value ): + '''set attribute.''' + if key == "pos": + self.pos = value + value += 1 + + cdef int idx + idx, f = self.map_key2field[key] + TupleProxy._setindex(self, idx, str(value) ) +