X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=pysam.git;a=blobdiff_plain;f=pysam%2Fctabix.pyx;fp=pysam%2Fctabix.pyx;h=8715e5d4cf8f4c2e9ff275d001c4697ed7eb206b;hp=0000000000000000000000000000000000000000;hb=70e0c1963e5f83b0a6ab2aa139e1a8f30aa25c56;hpb=91f4887d1b19c44d68a2b19f0abee56de3dbb8ea diff --git a/pysam/ctabix.pyx b/pysam/ctabix.pyx new file mode 100644 index 0000000..8715e5d --- /dev/null +++ b/pysam/ctabix.pyx @@ -0,0 +1,881 @@ +# cython: embedsignature=True +# adds doc-strings for sphinx + +import tempfile, os, sys, types, itertools, struct, ctypes + +cdef class Tabixfile: + '''*(filename, mode='r')* + + opens a :term:`tabix file` for reading. A missing + index (*filename* + ".tbi") will raise an exception. + ''' + + cdef char * filename + + # pointer to tabixfile + cdef tabix_t * tabixfile + + def __cinit__(self, *args, **kwargs ): + self.tabixfile = NULL + self._open( *args, **kwargs ) + + def _isOpen( self ): + '''return true if samfile has been opened.''' + return self.tabixfile != NULL + + def _open( self, + char * filename, + mode ='r', + ): + '''open a :term:`tabix file` for reading. + ''' + + assert mode in ( "r",), "invalid file opening mode `%s`" % mode + + # close a previously opened file + if self.tabixfile != NULL: self.close() + self.tabixfile = NULL + + self.filename = filename + filename_index = filename + ".tbi" + + if mode[0] == 'w': + # open file for writing + pass + + elif mode[0] == "r": + # open file for reading + if not os.path.exists( self.filename ): + raise IOError( "file `%s` not found" % self.filename) + + if not os.path.exists( filename_index ): + raise IOError( "index `%s` not found" % filename_index) + + # open file and load index + self.tabixfile = ti_open( self.filename, filename_index ) + + if self.tabixfile == NULL: + raise IOError("could not open file `%s`" % filename ) + + def _parseRegion( self, + reference = None, + start = None, + end = None, + region = None ): + '''parse region information. + + raise ValueError for for invalid regions. + + returns a tuple of region, tid, start and end. Region + is a valid samtools :term:`region` or None if the region + extends over the whole file. + + Note that regions are 1-based, while start,end are python coordinates. + ''' + ti_lazy_index_load( self.tabixfile ) + + cdef int rtid + cdef int rstart + cdef int rend + cdef int max_pos + max_pos = 2 << 29 + + rtid = rstart = rend = 0 + + # translate to a region + if reference: + if start != None and end != None: + region = "%s:%i-%i" % (reference, start+1, end) + elif start == None and end != None: + region = "%s:%i-%i" % (reference, 1, end) + elif end == None and start != None: + region = "%s:%i-%i" % (reference, start+1, max_pos-1) + else: + region = reference + + if region: + ti_parse_region( self.tabixfile.idx, region, &rtid, &rstart, &rend) + if rtid < 0: raise ValueError( "invalid region `%s`" % region ) + if rstart > rend: raise ValueError( 'invalid region: start (%i) > end (%i)' % (rstart, rend) ) + if not 0 <= rstart < max_pos: raise ValueError( 'start out of range (%i)' % rstart ) + if not 0 <= rend < max_pos: raise ValueError( 'end out of range (%i)' % rend ) + + return region, rtid, rstart, rend + + def fetch( self, + reference = None, + start = None, + end = None, + region = None, + parser = None ): + ''' + + fetch one or more rows in a :term:`region` using 0-based indexing. The region is specified by + :term:`reference`, *start* and *end*. Alternatively, a samtools :term:`region` string can be supplied. 
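+
+        A minimal usage sketch (assuming "example.gtf.gz" and its
+        ".tbi" index exist, and that this class is exposed as
+        ``pysam.Tabixfile``)::
+
+           import pysam
+           tabixfile = pysam.Tabixfile( "example.gtf.gz" )
+           for row in tabixfile.fetch( "chr1", 1000, 2000 ):
+               print row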
+ + Without *reference* or *region* all entries will be fetched. + + If only *reference* is set, all reads matching on *reference* will be fetched. + + If *parser* is None, the results are returned as an unparsed string. + Otherwise, *parser* is assumed to be a functor that will return parsed + data (see for example :meth:`asTuple` and :meth:`asGTF`). + ''' + ti_lazy_index_load( self.tabixfile ) + + if not self._isOpen(): + raise ValueError( "I/O operation on closed file" ) + + region, rtid, rstart, rend = self._parseRegion( reference, start, end, region ) + + if parser == None: + if region: + return TabixIterator( self, rtid, rstart, rend ) + else: + return TabixIterator( self, -1, 0, 0 ) + else: + if region: + return TabixIteratorParsed( self, rtid, rstart, rend, parser ) + else: + return TabixIteratorParsed( self, -1, 0, 0, parser ) + + property contigs: + '''chromosome names''' + def __get__(self): + cdef char ** sequences + cdef int nsequences + + ti_lazy_index_load( self.tabixfile ) + sequences = ti_seqname( self.tabixfile.idx, &nsequences ) + cdef int x + result = [] + for x from 0 <= x < nsequences: + result.append( sequences[x] ) + return result + +cdef class TabixIterator: + """iterates over rows in *tabixfile* in region + given by *tid*, *start* and *end*. + """ + + cdef ti_iter_t iterator + cdef tabix_t * tabixfile + + def __cinit__(self, Tabixfile tabixfile, + int tid, int start, int end ): + + assert tabixfile._isOpen() + + # makes sure that samfile stays alive as long as the + # iterator is alive. + self.tabixfile = tabixfile.tabixfile + + if tid < 0: + # seek to start of file to ensure iteration is over + # all entries. + bgzf_seek( self.tabixfile.fp, 0, 0) + self.iterator = ti_iter_first() + else: + self.iterator = ti_queryi(self.tabixfile, tid, start, end) + + if self.iterator == NULL: + raise ValueError("malformatted query or wrong sequence name.\n") + + def __iter__(self): + return self + + def __next__(self): + """python version of next(). + + pyrex uses this non-standard name instead of next() + """ + + cdef char * s + cdef int len + s = ti_read(self.tabixfile, self.iterator, &len) + if s == NULL: raise StopIteration + return s + + def __dealloc__(self): + if self.iterator != NULL: + ti_iter_destroy(self.iterator) + +def toDot( v ): + '''convert value to '.' if None''' + if v == None: return "." + else: return str(v) + +def quote( v ): + '''return a quoted attribute.''' + if type(v) in types.StringTypes: + return '"%s"' % v + else: + return str(v) + +cdef class TupleProxy: + '''Proxy class for access to parsed row as a tuple. + + This class represents a table row for fast read-access. + ''' + + cdef: + char * data + char ** fields + int nfields + int index + + def __cinit__(self ): + + self.data = NULL + self.fields = NULL + self.index = 0 + + cdef take( self, char * buffer, size_t nbytes ): + '''start presenting buffer. + + Take ownership of the pointer. + ''' + self.data = buffer + self.update( buffer, nbytes ) + + cdef present( self, char * buffer, size_t nbytes ): + '''start presenting buffer. + + Do not take ownership of the pointer. + ''' + self.update( buffer, nbytes ) + + cdef copy( self, char * buffer, size_t nbytes ): + '''start presenting buffer. + + Take a copy of buffer. 
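+
+        The copy makes the proxy independent of the iterator's internal
+        buffer. Downstream, the row is accessed like a read-only tuple;
+        a hypothetical sketch (assuming *tabixfile* is an open
+        :class:`Tabixfile`)::
+
+           for row in tabixfile.fetch( "chr1", 1000, 2000, parser = pysam.asTuple() ):
+               print row[0], row[1], len(row)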
+ ''' + cdef int s + # +1 for '\0' + s = sizeof(char) * (nbytes + 1) + self.data = malloc( s ) + memcpy( self.data, buffer, s ) + self.update( self.data, nbytes ) + + cdef update( self, char * buffer, size_t nbytes ): + '''update internal data.''' + cdef char * pos + cdef char * old_pos + cdef int field + cdef int max_fields + field = 0 + + if buffer[nbytes] != 0: + raise ValueError( "incomplete line at %s" % buffer ) + + if self.fields != NULL: + free(self.fields) + + max_fields = nbytes / 4 + self.fields = calloc( max_fields, sizeof(char *) ) + + pos = buffer + self.fields[0] = pos + field += 1 + old_pos = pos + + while 1: + + pos = memchr( pos, '\t', nbytes ) + if pos == NULL: break + pos[0] = '\0' + pos += 1 + self.fields[field] = pos + field += 1 + if field >= max_fields: + raise ValueError("row too large - more than %i fields" % max_fields ) + nbytes -= pos - old_pos + if nbytes < 0: break + old_pos = pos + + self.nfields = field + + def __getitem__( self, key ): + + cdef int i + i = key + if i < 0: i += self.nfields + if i >= self.nfields or i < 0: + raise IndexError( "list index out of range" ) + return self.fields[i] + + def __len__(self): + return self.nfields + + def __dealloc__(self): + if self.data != NULL: + free(self.data) + + def __iter__(self): + self.index = 0 + return self + + def __next__(self): + """python version of next(). + """ + if self.index >= self.nfields: + raise StopIteration + self.index += 1 + return self.fields[self.index-1] + +cdef class GTFProxy: + '''Proxy class for access to GTF fields. + + This class represents a GTF entry for fast read-access. + Write-access has been added as well, though some care must + be taken. If any of the string fields (contig, source, ...) + are set, the new value is tied to the lifetime of the + argument that was supplied. + + The only exception is the attributes field when set from + a dictionary - this field will manage its own memory. + + ''' + + cdef: + char * contig + char * source + char * feature + uint32_t start + uint32_t end + char * score + char * strand + char * frame + char * attributes + int nbytes + char * data + cdef bint isModified + cdef bint hasOwnAttributes + + def __cinit__(self ): + self.data = NULL + self.isModified = False + self.hasOwnAttributes = False + + cdef take( self, char * buffer, size_t nbytes ): + '''start presenting buffer. + + Take ownership of the pointer. + ''' + self.data = buffer + self.update( buffer, nbytes ) + self.isModified = False + + cdef present( self, char * buffer, size_t nbytes ): + '''start presenting buffer. + + Do not take ownership of the pointer. + ''' + self.update( buffer, nbytes ) + self.isModified = False + + cdef copy( self, char * buffer, size_t nbytes ): + '''start presenting buffer. + + Take a copy of buffer. + ''' + cdef int s + # +1 for '\0' + s = sizeof(char) * (nbytes + 1) + self.data = malloc( s ) + memcpy( self.data, buffer, s ) + self.update( self.data, nbytes ) + self.isModified = False + + cdef update( self, char * buffer, size_t nbytes ): + '''update internal data. + + nbytes does not include the terminal '\0'. 
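+
+        After parsing, the GTF columns are available as properties and
+        the attribute field can be inspected via :meth:`asDict`. A
+        hypothetical sketch ("gene_id" is only an illustrative
+        attribute name)::
+
+           for gtf in tabixfile.fetch( "chr1", 1000, 2000, parser = pysam.asGTF() ):
+               print gtf.contig, gtf.feature, gtf.start, gtf.end
+               print gtf.asDict().get( "gene_id" )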
+ ''' + cdef int end + cdef char * cstart, * cend, * cscore + self.contig = buffer + self.nbytes = nbytes + cdef char * pos + + if buffer[nbytes] != 0: + raise ValueError( "incomplete line at %s" % buffer ) + + pos = strchr( buffer, '\t' ) + if pos == NULL: raise ValueError( "malformatted entry at %s" % buffer ) + pos[0] = '\0' + pos += 1 + self.source = pos + + pos = strchr( pos, '\t' ) + if pos == NULL: raise ValueError( "malformatted entry at %s" % buffer ) + pos[0] = '\0' + pos += 1 + self.feature = pos + + pos = strchr( pos, '\t' ) + if pos == NULL: raise ValueError( "malformatted entry at %s" % buffer ) + pos[0] = '\0' + pos += 1 + cstart = pos + + pos = strchr( pos, '\t' ) + if pos == NULL: raise ValueError( "malformatted entry at %s" % buffer ) + pos[0] = '\0' + pos += 1 + cend = pos + + pos = strchr( pos, '\t' ) + if pos == NULL: raise ValueError( "malformatted entry at %s" % buffer ) + pos[0] = '\0' + pos += 1 + self.score = pos + + pos = strchr( pos, '\t' ) + if pos == NULL: raise ValueError( "malformatted entry at %s" % buffer ) + pos[0] = '\0' + pos += 1 + self.strand = pos + + pos = strchr( pos, '\t' ) + if pos == NULL: raise ValueError( "malformatted entry at %s" % buffer ) + pos[0] = '\0' + pos += 1 + self.frame = pos + + pos = strchr( pos, '\t' ) + if pos == NULL: raise ValueError( "malformatted entry at %s" % buffer ) + pos[0] = '\0' + pos += 1 + self.attributes = pos + self.start = atoi( cstart ) - 1 + self.end = atoi( cend ) + + property contig: + '''contig of feature.''' + def __get__( self ): return self.contig + def __set__( self, value ): + self.isModified = True + self.contig = value + + property feature: + '''feature name.''' + def __get__( self ): return self.feature + def __set__( self, value ): + self.isModified = True + self.feature = value + + property source: + '''feature source.''' + def __get__( self ): return self.source + def __set__( self, value ): + self.isModified = True + self.source = value + + property start: + '''feature start (in 0-based open/closed coordinates).''' + def __get__( self ): return self.start + def __set__( self, value ): + self.isModified = True + self.start = value + + property end: + '''feature end (in 0-based open/closed coordinates).''' + def __get__( self ): return self.end + def __set__( self, value ): + self.isModified = True + self.end = value + + property score: + '''feature score.''' + def __get__( self ): + if self.score[0] == '.' 
and self.score[1] == '\0' : + return None + else: + return atof(self.score) + def __set__( self, value ): + self.isModified = True + self.score = value + + property strand: + '''feature strand.''' + def __get__( self ): return self.strand + def __set__( self, value ): + self.isModified = True + self.strand = value + + property frame: + '''feature frame.''' + def __get__( self ): return self.frame + def __set__( self, value ): + self.isModified = True + self.frame = value + + property attributes: + '''feature attributes (as a string).''' + def __get__( self ): return self.attributes + def __set__( self, value ): + self.isModified = True + self.attributes = value + + def asDict( self ): + """parse attributes - return as dict + """ + + # remove comments + attributes = self.attributes + + # separate into fields + fields = [ x.strip() for x in attributes.split(";")[:-1]] + + result = {} + + for f in fields: + + d = [ x.strip() for x in f.split(" ")] + + n,v = d[0], d[1] + if len(d) > 2: v = d[1:] + + if v[0] == '"' and v[-1] == '"': + v = v[1:-1] + else: + ## try to convert to a value + try: + v = float( v ) + v = int( v ) + except ValueError: + pass + except TypeError: + pass + + result[n] = v + + return result + + def fromDict( self, d ): + '''set attributes from a dictionary.''' + cdef char * p + cdef int l + + # clean up if this field is set twice + if self.hasOwnAttributes: + free(self.attributes) + + aa = [] + for k,v in d.items(): + if type(v) == types.StringType: + aa.append( '%s "%s"' % (k,v) ) + else: + aa.append( '%s %s' % (k,str(v)) ) + + a = "; ".join( aa ) + ";" + p = a + l = len(a) + self.attributes = calloc( l + 1, sizeof(char) ) + memcpy( self.attributes, p, l ) + + self.hasOwnAttributes = True + self.isModified = True + + def __str__(self): + cdef char * cpy + cdef int x + + if self.isModified: + return "\t".join( + (self.contig, + self.source, + self.feature, + str(self.start+1), + str(self.end), + toDot(self.score), + self.strand, + self.frame, + self.attributes ) ) + else: + cpy = calloc( sizeof(char), self.nbytes+1 ) + memcpy( cpy, self.data, self.nbytes+1) + for x from 0 <= x < self.nbytes: + if cpy[x] == '\0': cpy[x] = '\t' + result = cpy + free(cpy) + return result + + def invert( self, int lcontig ): + '''invert coordinates to negative strand coordinates + + This method will only act if the feature is on the + negative strand.''' + + if self.strand[0] == '-': + start = min(self.start, self.end) + end = max(self.start, self.end) + self.start, self.end = lcontig - end, lcontig - start + + def keys( self ): + '''return a list of attributes defined in this entry.''' + r = self.attributes + return [ x.strip().split(" ")[0] for x in r.split(";") if x.strip() != '' ] + + def __getitem__(self, item): + return self.__getattr__( item ) + + def __dealloc__(self): + if self.data != NULL: + free(self.data) + if self.hasOwnAttributes: + free(self.attributes) + + def __getattr__(self, item ): + """Generic lookup of attribute from GFF/GTF attributes + Only called if there *isn't* an attribute with this name + """ + cdef char * start + cdef char * query + cdef char * cpy + cdef char * end + cdef int l + query = item + + start = strstr( self.attributes, query) + if start == NULL: + raise AttributeError("'GTFProxy' has no attribute '%s'" % item ) + + start += strlen(query) + 1 + # skip gaps before + while start[0] == " ": start += 1 + if start[0] == '"': + start += 1 + end = start + while end[0] != '\0' and end[0] != '"': end += 1 + l = end - start + 1 + cpy = calloc( l, sizeof(char ) ) 
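+            # copy the value between the quotes and NUL-terminate it
+            # before converting it to a Python string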
+ memcpy( cpy, start, l ) + cpy[l-1] = '\0' + result = cpy + free(cpy) + return result + else: + return start + + def setAttribute( self, name, value ): + '''convenience method to set an attribute.''' + r = self.asDict() + r[name] = value + self.fromDict( r ) + +cdef class Parser: + pass + +cdef class asTuple(Parser): + '''converts a :term:`tabix row` into a python tuple.''' + def __call__(self, char * buffer, int len): + cdef TupleProxy r + r = TupleProxy() + # need to copy - there were some + # persistence issues with "present" + r.copy( buffer, len ) + return r + +cdef class asGTF(Parser): + '''converts a :term:`tabix row` into a GTF record.''' + def __call__(self, char * buffer, int len): + cdef GTFProxy r + r = GTFProxy() + r.copy( buffer, len ) + return r + +cdef class TabixIteratorParsed: + """iterates over mapped reads in a region. + """ + + cdef ti_iter_t iterator + cdef tabix_t * tabixfile + cdef Parser parser + + def __cinit__(self, + Tabixfile tabixfile, + int tid, + int start, + int end, + Parser parser ): + + assert tabixfile._isOpen() + self.parser = parser + + # makes sure that samfile stays alive as long as the + # iterator is alive. + self.tabixfile = tabixfile.tabixfile + + if tid < 0: + # seek to start of file to ensure iteration is over + # all entries. + bgzf_seek( self.tabixfile.fp, 0, 0) + self.iterator = ti_iter_first() + else: + self.iterator = ti_queryi(self.tabixfile, tid, start, end) + + if self.iterator == NULL: + raise ValueError("malformatted query or wrong sequence name.\n") + + def __iter__(self): + return self + + def __next__(self): + """python version of next(). + + pyrex uses this non-standard name instead of next() + """ + + cdef char * s + cdef int len + s = ti_read(self.tabixfile, self.iterator, &len) + if s == NULL: raise StopIteration + return self.parser(s, len) + + def __dealloc__(self): + if self.iterator != NULL: + ti_iter_destroy(self.iterator) + +def tabix_compress( filename_in, + filename_out, + force = False ): + + ''' + compress *filename_in* writing the output to *filename_out*. + + Raise an IOError if *filename_out* already exists, unless *force* is set. + ''' + + if not force and os.path.exists(filename_out ): + raise IOError( "Filename '%s' already exists, use *force* to overwrite" % filename_out) + + cdef int WINDOW_SIZE + cdef int c, r + cdef void * buffer + cdef BGZF * fp + cdef int fd_src + + cdef int O_RDONLY + O_RDONLY = os.O_RDONLY + + WINDOW_SIZE = 64 * 1024 + + fp = bgzf_open( filename_out, "w") + if fp == NULL: + raise IOError( "could not open '%s' for writing" ) + + fd_src = open(filename_in, O_RDONLY) + if fd_src == 0: + raise IOError( "could not open '%s' for reading" ) + + buffer = malloc(WINDOW_SIZE) + + while c > 0: + c = read(fd_src, buffer, WINDOW_SIZE) + r = bgzf_write(fp, buffer, c) + if r < 0: + free( buffer ) + raise OSError("writing failed") + + free( buffer ) + r = bgzf_close(fp) + if r < 0: raise OSError("writing failed") + +def tabix_index( filename, + force = False, + seq_col = None, + start_col = None, + end_col = None, + preset = None, + meta_char = "#", + zerobased = False, + ): + ''' + index tab-separated *filename* using tabix. + + An existing index will not be overwritten unless + *force* is set. + + The index will be built from coordinates + in columns *seq_col*, *start_col* and *end_col*. + + The contents of *filename* have to be sorted by + contig and position - the method does not check + if the file is sorted. + + Column indices are 0-based. Coordinates in the file + are assumed to be 1-based. 
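+
+    For example, for a tab-separated file with the contig in column 0
+    and 1-based start and end positions in columns 1 and 2, a
+    hypothetical call would be::
+
+       pysam.tabix_index( "example.tsv", seq_col = 0, start_col = 1, end_col = 2 )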
+
+    If *preset* is provided, the column coordinates
+    are taken from a preset. Valid values for preset
+    are "gff", "bed", "sam", "vcf", "psltbl", "pileup".
+
+    Lines beginning with *meta_char* will be skipped.
+
+    If *filename* does not end in ".gz", it will be automatically
+    compressed. The original file will be removed and only the
+    compressed file will be retained.
+
+    If *filename* ends in ".gz", the file is assumed to be already
+    compressed with bgzf.
+
+    returns the filename of the compressed data
+    '''
+
+    if not os.path.exists(filename): raise IOError("No such file '%s'" % filename)
+
+    if not filename.endswith(".gz"):
+
+        tabix_compress( filename, filename + ".gz", force = force )
+        os.unlink( filename )
+        filename += ".gz"
+
+    if not force and os.path.exists(filename + ".tbi" ):
+        raise IOError( "Filename '%s.tbi' already exists, use *force* to overwrite" % filename )
+
+    # columns (1-based)
+    # preset-code, contig, start, end, meta character for comments, lines to skip at the beginning
+    # 0 is a missing column
+    preset2conf = {
+        'gff' : ( 0, 1, 4, 5, ord('#'), 0 ),
+        'bed' : ( 0x10000, 1, 2, 3, ord('#'), 0 ),
+        'psltbl' : ( 0x10000, 15, 17, 18, ord('#'), 0 ),
+        'sam' : ( 1, 3, 4, 0, ord('#'), 0 ),
+        'vcf' : ( 2, 1, 2, 0, ord('#'), 0 ),
+        'pileup': ( 3, 1, 2, 0, ord('#'), 0 ),
+        }
+
+    if preset:
+        try:
+            conf_data = preset2conf[preset]
+        except KeyError:
+            raise KeyError( "unknown preset '%s', valid presets are '%s'" % (preset, ",".join(preset2conf.keys() )))
+    else:
+        if end_col == None: end_col = -1
+        preset = 0
+
+        # note that tabix internally works with 0-based coordinates and open/closed intervals.
+        # When using a preset, conversion is taken care of automatically.
+        # Otherwise, the coordinates are assumed to be 1-based closed intervals and
+        # -1 is subtracted from the start coordinate. To avoid this, set
+        # the TI_FLAG_UCSC=0x10000 flag:
+        if zerobased: preset = preset | 0x10000
+
+        conf_data = (preset, seq_col+1, start_col+1, end_col+1, ord(meta_char), 0)
+
+    cdef ti_conf_t conf
+    conf.preset, conf.sc, conf.bc, conf.ec, conf.meta_char, conf.line_skip = conf_data
+
+    ti_index_build( filename, &conf)
+
+    return filename
+
+__all__ = ["tabix_index",
+           "tabix_compress",
+           "Tabixfile",
+           "asTuple",
+           "asGTF",
+           ]
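+
+# A hypothetical end-to-end sketch of the helpers exported above
+# (file names are illustrative only):
+#
+#   pysam.tabix_compress( "example.vcf", "example.vcf.gz", force = True )
+#   pysam.tabix_index( "example.vcf.gz", preset = "vcf", force = True )
+#   tabixfile = pysam.Tabixfile( "example.vcf.gz" )
+#
+# Rows can then be retrieved with Tabixfile.fetch().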