X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=pysam.git;a=blobdiff_plain;f=pysam%2Fctabix.pyx;fp=pysam%2Fctabix.pyx;h=25c1a1ee732df3e383a29e80c1ea2ae533b7925e;hp=8715e5d4cf8f4c2e9ff275d001c4697ed7eb206b;hb=bd0c3067c187d1f718004fb38acc093af8810a02;hpb=1b740fc70684c92a5e2293013217d5a2fd661d8a diff --git a/pysam/ctabix.pyx b/pysam/ctabix.pyx index 8715e5d..25c1a1e 100644 --- a/pysam/ctabix.pyx +++ b/pysam/ctabix.pyx @@ -1,7 +1,9 @@ # cython: embedsignature=True # adds doc-strings for sphinx -import tempfile, os, sys, types, itertools, struct, ctypes +import tempfile, os, sys, types, itertools, struct, ctypes, gzip +from cpython cimport PyString_FromStringAndSize, PyString_AS_STRING +cimport TabProxies cdef class Tabixfile: '''*(filename, mode='r')* @@ -10,11 +12,6 @@ cdef class Tabixfile: index (*filename* + ".tbi") will raise an exception. ''' - cdef char * filename - - # pointer to tabixfile - cdef tabix_t * tabixfile - def __cinit__(self, *args, **kwargs ): self.tabixfile = NULL self._open( *args, **kwargs ) @@ -139,19 +136,27 @@ cdef class Tabixfile: else: return TabixIteratorParsed( self, -1, 0, 0, parser ) + property header: + def __get__( self ): + '''return header lines as an iterator. + + Note that the header lines do not contain the newline '\n' character. + ''' + return TabixHeaderIterator( self ) + property contigs: - '''chromosome names''' - def __get__(self): - cdef char ** sequences - cdef int nsequences + '''chromosome names''' + def __get__(self): + cdef char ** sequences + cdef int nsequences - ti_lazy_index_load( self.tabixfile ) - sequences = ti_seqname( self.tabixfile.idx, &nsequences ) - cdef int x - result = [] - for x from 0 <= x < nsequences: - result.append( sequences[x] ) - return result + ti_lazy_index_load( self.tabixfile ) + sequences = ti_seqname( self.tabixfile.idx, &nsequences ) + cdef int x + result = [] + for x from 0 <= x < nsequences: + result.append( sequences[x] ) + return result cdef class TabixIterator: """iterates over rows in *tabixfile* in region @@ -192,487 +197,78 @@ cdef class TabixIterator: cdef char * s cdef int len - s = ti_read(self.tabixfile, self.iterator, &len) - if s == NULL: raise StopIteration + # metachar filtering does not work within tabix + # though it should. Getting the metachar is a pain + # as ti_index_t is incomplete type. + + # simply use '#' for now. + while 1: + s = ti_read(self.tabixfile, self.iterator, &len) + if s == NULL: raise StopIteration + if s[0] != '#': break + return s def __dealloc__(self): if self.iterator != NULL: ti_iter_destroy(self.iterator) -def toDot( v ): - '''convert value to '.' if None''' - if v == None: return "." - else: return str(v) - -def quote( v ): - '''return a quoted attribute.''' - if type(v) in types.StringTypes: - return '"%s"' % v - else: - return str(v) - -cdef class TupleProxy: - '''Proxy class for access to parsed row as a tuple. - - This class represents a table row for fast read-access. - ''' - - cdef: - char * data - char ** fields - int nfields - int index - - def __cinit__(self ): - - self.data = NULL - self.fields = NULL - self.index = 0 - - cdef take( self, char * buffer, size_t nbytes ): - '''start presenting buffer. - - Take ownership of the pointer. - ''' - self.data = buffer - self.update( buffer, nbytes ) - - cdef present( self, char * buffer, size_t nbytes ): - '''start presenting buffer. - - Do not take ownership of the pointer. - ''' - self.update( buffer, nbytes ) - - cdef copy( self, char * buffer, size_t nbytes ): - '''start presenting buffer. +cdef class TabixHeaderIterator: + """return header lines. + """ + + cdef ti_iter_t iterator + cdef tabix_t * tabixfile - Take a copy of buffer. - ''' - cdef int s - # +1 for '\0' - s = sizeof(char) * (nbytes + 1) - self.data = malloc( s ) - memcpy( self.data, buffer, s ) - self.update( self.data, nbytes ) - - cdef update( self, char * buffer, size_t nbytes ): - '''update internal data.''' - cdef char * pos - cdef char * old_pos - cdef int field - cdef int max_fields - field = 0 - - if buffer[nbytes] != 0: - raise ValueError( "incomplete line at %s" % buffer ) + def __cinit__(self, Tabixfile tabixfile ): - if self.fields != NULL: - free(self.fields) - - max_fields = nbytes / 4 - self.fields = calloc( max_fields, sizeof(char *) ) - - pos = buffer - self.fields[0] = pos - field += 1 - old_pos = pos + assert tabixfile._isOpen() - while 1: - - pos = memchr( pos, '\t', nbytes ) - if pos == NULL: break - pos[0] = '\0' - pos += 1 - self.fields[field] = pos - field += 1 - if field >= max_fields: - raise ValueError("row too large - more than %i fields" % max_fields ) - nbytes -= pos - old_pos - if nbytes < 0: break - old_pos = pos - - self.nfields = field - - def __getitem__( self, key ): - - cdef int i - i = key - if i < 0: i += self.nfields - if i >= self.nfields or i < 0: - raise IndexError( "list index out of range" ) - return self.fields[i] + # makes sure that samfile stays alive as long as the + # iterator is alive. + self.tabixfile = tabixfile.tabixfile - def __len__(self): - return self.nfields + self.iterator = ti_query(self.tabixfile, NULL, 0, 0) - def __dealloc__(self): - if self.data != NULL: - free(self.data) + if self.iterator == NULL: + raise ValueError("can't open header.\n") def __iter__(self): - self.index = 0 - return self + return self def __next__(self): """python version of next(). - """ - if self.index >= self.nfields: - raise StopIteration - self.index += 1 - return self.fields[self.index-1] -cdef class GTFProxy: - '''Proxy class for access to GTF fields. - - This class represents a GTF entry for fast read-access. - Write-access has been added as well, though some care must - be taken. If any of the string fields (contig, source, ...) - are set, the new value is tied to the lifetime of the - argument that was supplied. - - The only exception is the attributes field when set from - a dictionary - this field will manage its own memory. - - ''' - - cdef: - char * contig - char * source - char * feature - uint32_t start - uint32_t end - char * score - char * strand - char * frame - char * attributes - int nbytes - char * data - cdef bint isModified - cdef bint hasOwnAttributes - - def __cinit__(self ): - self.data = NULL - self.isModified = False - self.hasOwnAttributes = False - - cdef take( self, char * buffer, size_t nbytes ): - '''start presenting buffer. - - Take ownership of the pointer. - ''' - self.data = buffer - self.update( buffer, nbytes ) - self.isModified = False - - cdef present( self, char * buffer, size_t nbytes ): - '''start presenting buffer. - - Do not take ownership of the pointer. - ''' - self.update( buffer, nbytes ) - self.isModified = False - - cdef copy( self, char * buffer, size_t nbytes ): - '''start presenting buffer. - - Take a copy of buffer. - ''' - cdef int s - # +1 for '\0' - s = sizeof(char) * (nbytes + 1) - self.data = malloc( s ) - memcpy( self.data, buffer, s ) - self.update( self.data, nbytes ) - self.isModified = False - - cdef update( self, char * buffer, size_t nbytes ): - '''update internal data. - - nbytes does not include the terminal '\0'. - ''' - cdef int end - cdef char * cstart, * cend, * cscore - self.contig = buffer - self.nbytes = nbytes - cdef char * pos - - if buffer[nbytes] != 0: - raise ValueError( "incomplete line at %s" % buffer ) - - pos = strchr( buffer, '\t' ) - if pos == NULL: raise ValueError( "malformatted entry at %s" % buffer ) - pos[0] = '\0' - pos += 1 - self.source = pos - - pos = strchr( pos, '\t' ) - if pos == NULL: raise ValueError( "malformatted entry at %s" % buffer ) - pos[0] = '\0' - pos += 1 - self.feature = pos - - pos = strchr( pos, '\t' ) - if pos == NULL: raise ValueError( "malformatted entry at %s" % buffer ) - pos[0] = '\0' - pos += 1 - cstart = pos - - pos = strchr( pos, '\t' ) - if pos == NULL: raise ValueError( "malformatted entry at %s" % buffer ) - pos[0] = '\0' - pos += 1 - cend = pos - - pos = strchr( pos, '\t' ) - if pos == NULL: raise ValueError( "malformatted entry at %s" % buffer ) - pos[0] = '\0' - pos += 1 - self.score = pos - - pos = strchr( pos, '\t' ) - if pos == NULL: raise ValueError( "malformatted entry at %s" % buffer ) - pos[0] = '\0' - pos += 1 - self.strand = pos - - pos = strchr( pos, '\t' ) - if pos == NULL: raise ValueError( "malformatted entry at %s" % buffer ) - pos[0] = '\0' - pos += 1 - self.frame = pos - - pos = strchr( pos, '\t' ) - if pos == NULL: raise ValueError( "malformatted entry at %s" % buffer ) - pos[0] = '\0' - pos += 1 - self.attributes = pos - self.start = atoi( cstart ) - 1 - self.end = atoi( cend ) - - property contig: - '''contig of feature.''' - def __get__( self ): return self.contig - def __set__( self, value ): - self.isModified = True - self.contig = value - - property feature: - '''feature name.''' - def __get__( self ): return self.feature - def __set__( self, value ): - self.isModified = True - self.feature = value - - property source: - '''feature source.''' - def __get__( self ): return self.source - def __set__( self, value ): - self.isModified = True - self.source = value - - property start: - '''feature start (in 0-based open/closed coordinates).''' - def __get__( self ): return self.start - def __set__( self, value ): - self.isModified = True - self.start = value - - property end: - '''feature end (in 0-based open/closed coordinates).''' - def __get__( self ): return self.end - def __set__( self, value ): - self.isModified = True - self.end = value - - property score: - '''feature score.''' - def __get__( self ): - if self.score[0] == '.' and self.score[1] == '\0' : - return None - else: - return atof(self.score) - def __set__( self, value ): - self.isModified = True - self.score = value - - property strand: - '''feature strand.''' - def __get__( self ): return self.strand - def __set__( self, value ): - self.isModified = True - self.strand = value - - property frame: - '''feature frame.''' - def __get__( self ): return self.frame - def __set__( self, value ): - self.isModified = True - self.frame = value - - property attributes: - '''feature attributes (as a string).''' - def __get__( self ): return self.attributes - def __set__( self, value ): - self.isModified = True - self.attributes = value - - def asDict( self ): - """parse attributes - return as dict + pyrex uses this non-standard name instead of next() """ - - # remove comments - attributes = self.attributes - - # separate into fields - fields = [ x.strip() for x in attributes.split(";")[:-1]] - - result = {} - - for f in fields: - - d = [ x.strip() for x in f.split(" ")] - - n,v = d[0], d[1] - if len(d) > 2: v = d[1:] - - if v[0] == '"' and v[-1] == '"': - v = v[1:-1] - else: - ## try to convert to a value - try: - v = float( v ) - v = int( v ) - except ValueError: - pass - except TypeError: - pass - - result[n] = v - - return result - def fromDict( self, d ): - '''set attributes from a dictionary.''' - cdef char * p - cdef int l - - # clean up if this field is set twice - if self.hasOwnAttributes: - free(self.attributes) - - aa = [] - for k,v in d.items(): - if type(v) == types.StringType: - aa.append( '%s "%s"' % (k,v) ) - else: - aa.append( '%s %s' % (k,str(v)) ) - - a = "; ".join( aa ) + ";" - p = a - l = len(a) - self.attributes = calloc( l + 1, sizeof(char) ) - memcpy( self.attributes, p, l ) - - self.hasOwnAttributes = True - self.isModified = True - - def __str__(self): - cdef char * cpy - cdef int x - - if self.isModified: - return "\t".join( - (self.contig, - self.source, - self.feature, - str(self.start+1), - str(self.end), - toDot(self.score), - self.strand, - self.frame, - self.attributes ) ) - else: - cpy = calloc( sizeof(char), self.nbytes+1 ) - memcpy( cpy, self.data, self.nbytes+1) - for x from 0 <= x < self.nbytes: - if cpy[x] == '\0': cpy[x] = '\t' - result = cpy - free(cpy) - return result - - def invert( self, int lcontig ): - '''invert coordinates to negative strand coordinates - - This method will only act if the feature is on the - negative strand.''' - - if self.strand[0] == '-': - start = min(self.start, self.end) - end = max(self.start, self.end) - self.start, self.end = lcontig - end, lcontig - start + cdef char * s + cdef int len - def keys( self ): - '''return a list of attributes defined in this entry.''' - r = self.attributes - return [ x.strip().split(" ")[0] for x in r.split(";") if x.strip() != '' ] + # Getting the metachar is a pain as ti_index_t is incomplete type. + # simply use '#' for now. + s = ti_read(self.tabixfile, self.iterator, &len) + if s == NULL: raise StopIteration + # stop at first non-header line + if s[0] != '#': raise StopIteration - def __getitem__(self, item): - return self.__getattr__( item ) + return s def __dealloc__(self): - if self.data != NULL: - free(self.data) - if self.hasOwnAttributes: - free(self.attributes) - - def __getattr__(self, item ): - """Generic lookup of attribute from GFF/GTF attributes - Only called if there *isn't* an attribute with this name - """ - cdef char * start - cdef char * query - cdef char * cpy - cdef char * end - cdef int l - query = item - - start = strstr( self.attributes, query) - if start == NULL: - raise AttributeError("'GTFProxy' has no attribute '%s'" % item ) - - start += strlen(query) + 1 - # skip gaps before - while start[0] == " ": start += 1 - if start[0] == '"': - start += 1 - end = start - while end[0] != '\0' and end[0] != '"': end += 1 - l = end - start + 1 - cpy = calloc( l, sizeof(char ) ) - memcpy( cpy, start, l ) - cpy[l-1] = '\0' - result = cpy - free(cpy) - return result - else: - return start - - def setAttribute( self, name, value ): - '''convenience method to set an attribute.''' - r = self.asDict() - r[name] = value - self.fromDict( r ) + if self.iterator != NULL: + ti_iter_destroy(self.iterator) +######################################################### +######################################################### +######################################################### cdef class Parser: pass cdef class asTuple(Parser): '''converts a :term:`tabix row` into a python tuple.''' def __call__(self, char * buffer, int len): - cdef TupleProxy r - r = TupleProxy() + cdef TabProxies.TupleProxy r + r = TabProxies.TupleProxy() # need to copy - there were some # persistence issues with "present" r.copy( buffer, len ) @@ -681,15 +277,36 @@ cdef class asTuple(Parser): cdef class asGTF(Parser): '''converts a :term:`tabix row` into a GTF record.''' def __call__(self, char * buffer, int len): - cdef GTFProxy r - r = GTFProxy() + cdef TabProxies.GTFProxy r + r = TabProxies.GTFProxy() r.copy( buffer, len ) return r +cdef class asBed( Parser ): + '''converts a :term:`tabix row` into a GTF record.''' + def __call__(self, char * buffer, int len): + cdef TabProxies.BedProxy r + r = TabProxies.BedProxy() + r.copy( buffer, len ) + return r + +cdef class asVCF( Parser ): + '''converts a :term:`tabix row` into a VCF record.''' + def __call__(self, char * buffer, int len ): + cdef TabProxies.VCFProxy r + r = TabProxies.VCFProxy() + r.copy( buffer, len ) + return r + +######################################################### +######################################################### +######################################################### cdef class TabixIteratorParsed: """iterates over mapped reads in a region. + + Returns parsed data. """ - + cdef ti_iter_t iterator cdef tabix_t * tabixfile cdef Parser parser @@ -730,8 +347,12 @@ cdef class TabixIteratorParsed: cdef char * s cdef int len - s = ti_read(self.tabixfile, self.iterator, &len) - if s == NULL: raise StopIteration + while 1: + s = ti_read(self.tabixfile, self.iterator, &len) + if s == NULL: raise StopIteration + # todo: read metachar from configuration + if s[0] != '#': break + return self.parser(s, len) def __dealloc__(self): @@ -878,4 +499,6 @@ __all__ = ["tabix_index", "Tabixfile", "asTuple", "asGTF", + "asVCF", + "asBed", ]