pysam/ctabix.pyx

   1 # cython: embedsignature=True
   2 # adds doc-strings for sphinx
   3
   4 import tempfile, os, sys, types, itertools, struct, ctypes, gzip
   5 from cpython cimport PyString_FromStringAndSize, PyString_AS_STRING
   6 cimport TabProxies
   7
   8 cdef class Tabixfile:
   9     '''*(filename, mode='r')*
  10
  11     opens a :term:`tabix file` for reading. A missing
  12     index (*filename* + ".tbi") will raise an exception.
  13     '''
  14
  15     def __cinit__(self, *args, **kwargs ):
  16         self.tabixfile = NULL
  17         self._open( *args, **kwargs )
  18
  19     def _isOpen( self ):
  20         '''return true if samfile has been opened.'''
  21         return self.tabixfile != NULL
  22
  23     def _open( self,
  24                char * filename,
  25                mode ='r',
  26               ):
  27         '''open a :term:`tabix file` for reading.
  28         '''
  29
  30         assert mode in ( "r",), "invalid file opening mode `%s`" % mode
  31
  32         # close a previously opened file
  33         if self.tabixfile != NULL: self.close()
  34         self.tabixfile = NULL
  35
  36         self.filename = filename
  37         filename_index = filename + ".tbi"
  38
  39         if mode[0] == 'w':
  40             # open file for writing
  41             pass
  42
  43         elif mode[0] == "r":
  44             # open file for reading
  45             if not os.path.exists( self.filename ):
  46                 raise IOError( "file `%s` not found" % self.filename)
  47
  48             if not os.path.exists( filename_index ):
  49                 raise IOError( "index `%s` not found" % filename_index)
  50
  51             # open file and load index
  52             self.tabixfile = ti_open( self.filename, filename_index )
  53
  54         if self.tabixfile == NULL:
  55             raise IOError("could not open file `%s`" % filename )
  56
  57     def _parseRegion( self,
  58                       reference = None,
  59                       start = None,
  60                       end = None,
  61                       region = None ):
  62         '''parse region information.
  63
  64         raise ValueError for for invalid regions.
  65
  66         returns a tuple of region, tid, start and end. Region
  67         is a valid samtools :term:`region` or None if the region
  68         extends over the whole file.
  69
  70         Note that regions are 1-based, while start,end are python coordinates.
  71         '''
  72         ti_lazy_index_load( self.tabixfile )
  73
  74         cdef int rtid
  75         cdef int rstart
  76         cdef int rend
  77         cdef int max_pos
  78         max_pos = 2 << 29
  79
  80         rtid = rstart = rend = 0
  81
  82         # translate to a region
  83         if reference:
  84             if start != None and end != None:
  85                 region = "%s:%i-%i" % (reference, start+1, end)
  86             elif start == None and end != None:
  87                 region = "%s:%i-%i" % (reference, 1, end)
  88             elif end == None and start != None:
  89                 region = "%s:%i-%i" % (reference, start+1, max_pos-1)
  90             else:
  91                 region = reference
  92
  93         if region:
  94             ti_parse_region( self.tabixfile.idx, region, &rtid, &rstart, &rend)
  95             if rtid < 0: raise ValueError( "invalid region `%s`" % region )
  96             if rstart > rend: raise ValueError( 'invalid region: start (%i) > end (%i)' % (rstart, rend) )
  97             if not 0 <= rstart < max_pos: raise ValueError( 'start out of range (%i)' % rstart )
  98             if not 0 <= rend < max_pos: raise ValueError( 'end out of range (%i)' % rend )
  99
 100         return region, rtid, rstart, rend
 101
 102     def fetch( self,
 103                reference = None,
 104                start = None,
 105                end = None,
 106                region = None,
 107                parser = None ):
 108         '''
 109
 110         fetch one or more rows in a :term:`region` using 0-based indexing. The region is specified by
 111         :term:`reference`, *start* and *end*. Alternatively, a samtools :term:`region` string can be supplied.
 112
 113         Without *reference* or *region* all entries will be fetched.
 114
 115         If only *reference* is set, all reads matching on *reference* will be fetched.
 116
 117         If *parser* is None, the results are returned as an unparsed string.
 118         Otherwise, *parser* is assumed to be a functor that will return parsed
 119         data (see for example :meth:`asTuple` and :meth:`asGTF`).
 120         '''
 121         ti_lazy_index_load( self.tabixfile )
 122
 123         if not self._isOpen():
 124             raise ValueError( "I/O operation on closed file" )
 125
 126         region, rtid, rstart, rend = self._parseRegion( reference, start, end, region )
 127
 128         if parser == None:
 129             if region:
 130                 return TabixIterator( self, rtid, rstart, rend )
 131             else:
 132                 return TabixIterator( self, -1, 0, 0 )
 133         else:
 134             if region:
 135                 return TabixIteratorParsed( self, rtid, rstart, rend, parser )
 136             else:
 137                 return TabixIteratorParsed( self, -1, 0, 0, parser )
 138
 139     property header:
 140         def __get__( self ):
 141             '''return header lines as an iterator.
 142
 143             Note that the header lines do not contain the newline '\n' character.
 144             '''
 145             return TabixHeaderIterator( self )
 146
 147     property contigs:
 148         '''chromosome names'''
 149         def __get__(self):
 150             cdef char ** sequences
 151             cdef int nsequences
 152
 153             ti_lazy_index_load( self.tabixfile )
 154             sequences = ti_seqname( self.tabixfile.idx, &nsequences )
 155             cdef int x
 156             result = []
 157             for x from 0 <= x < nsequences:
 158                 result.append( sequences[x] )
 159             return result
 160
 161 cdef class TabixIterator:
 162     """iterates over rows in *tabixfile* in region
 163     given by *tid*, *start* and *end*.
 164     """
 165
 166     cdef ti_iter_t iterator
 167     cdef tabix_t * tabixfile
 168
 169     def __cinit__(self, Tabixfile tabixfile,
 170                   int tid, int start, int end ):
 171
 172         assert tabixfile._isOpen()
 173
 174         # makes sure that samfile stays alive as long as the
 175         # iterator is alive.
 176         self.tabixfile = tabixfile.tabixfile
 177
 178         if tid < 0:
 179             # seek to start of file to ensure iteration is over
 180             # all entries.
 181             bgzf_seek( self.tabixfile.fp, 0, 0)
 182             self.iterator = ti_iter_first()
 183         else:
 184             self.iterator = ti_queryi(self.tabixfile, tid, start, end)
 185
 186         if <void*>self.iterator == NULL:
 187             raise ValueError("malformatted query or wrong sequence name.\n")
 188
 189     def __iter__(self):
 190         return self
 191
 192     def __next__(self):
 193         """python version of next().
 194
 195         pyrex uses this non-standard name instead of next()
 196         """
 197
 198         cdef char * s
 199         cdef int len
 200         # metachar filtering does not work within tabix
 201         # though it should. Getting the metachar is a pain
 202         # as ti_index_t is incomplete type.
 203
 204         # simply use '#' for now.
 205         while 1:
 206             s = ti_read(self.tabixfile, self.iterator, &len)
 207             if s == NULL: raise StopIteration
 208             if s[0] != '#': break
 209
 210         return s
 211
 212     def __dealloc__(self):
 213         if <void*>self.iterator != NULL:
 214             ti_iter_destroy(self.iterator)
 215
 216 cdef class TabixHeaderIterator:
 217     """return header lines.
 218     """
 219
 220     cdef ti_iter_t iterator
 221     cdef tabix_t * tabixfile
 222
 223     def __cinit__(self, Tabixfile tabixfile ):
 224
 225         assert tabixfile._isOpen()
 226
 227         # makes sure that samfile stays alive as long as the
 228         # iterator is alive.
 229         self.tabixfile = tabixfile.tabixfile
 230
 231         self.iterator = ti_query(self.tabixfile, NULL, 0, 0)
 232
 233         if <void*>self.iterator == NULL:
 234             raise ValueError("can't open header.\n")
 235
 236     def __iter__(self):
 237         return self
 238
 239     def __next__(self):
 240         """python version of next().
 241
 242         pyrex uses this non-standard name instead of next()
 243         """
 244
 245         cdef char * s
 246         cdef int len
 247
 248         # Getting the metachar is a pain as ti_index_t is incomplete type.
 249         # simply use '#' for now.
 250         s = ti_read(self.tabixfile, self.iterator, &len)
 251         if s == NULL: raise StopIteration
 252         # stop at first non-header line
 253         if s[0] != '#': raise StopIteration
 254
 255         return s
 256
 257     def __dealloc__(self):
 258         if <void*>self.iterator != NULL:
 259             ti_iter_destroy(self.iterator)
 260
 261 #########################################################
 262 #########################################################
 263 #########################################################
 264 cdef class Parser:
 265     pass
 266
 267 cdef class asTuple(Parser):
 268     '''converts a :term:`tabix row` into a python tuple.'''
 269     def __call__(self, char * buffer, int len):
 270         cdef TabProxies.TupleProxy r
 271         r = TabProxies.TupleProxy()
 272         # need to copy - there were some
 273         # persistence issues with "present"
 274         r.copy( buffer, len )
 275         return r
 276
 277 cdef class asGTF(Parser):
 278     '''converts a :term:`tabix row` into a GTF record.'''
 279     def __call__(self, char * buffer, int len):
 280         cdef TabProxies.GTFProxy r
 281         r = TabProxies.GTFProxy()
 282         r.copy( buffer, len )
 283         return r
 284
 285 cdef class asBed( Parser ):
 286     '''converts a :term:`tabix row` into a GTF record.'''
 287     def __call__(self, char * buffer, int len):
 288         cdef TabProxies.BedProxy r
 289         r = TabProxies.BedProxy()
 290         r.copy( buffer, len )
 291         return r
 292
 293 cdef class asVCF( Parser ):
 294     '''converts a :term:`tabix row` into a VCF record.'''
 295     def __call__(self, char * buffer, int len ):
 296         cdef TabProxies.VCFProxy r
 297         r = TabProxies.VCFProxy()
 298         r.copy( buffer, len )
 299         return r
 300
 301 #########################################################
 302 #########################################################
 303 #########################################################
 304 cdef class TabixIteratorParsed:
 305     """iterates over mapped reads in a region.
 306
 307     Returns parsed data.
 308     """
 309
 310     cdef ti_iter_t iterator
 311     cdef tabix_t * tabixfile
 312     cdef Parser parser
 313
 314     def __cinit__(self,
 315                   Tabixfile tabixfile,
 316                   int tid,
 317                   int start,
 318                   int end,
 319                   Parser parser ):
 320
 321         assert tabixfile._isOpen()
 322         self.parser = parser
 323
 324         # makes sure that samfile stays alive as long as the
 325         # iterator is alive.
 326         self.tabixfile = tabixfile.tabixfile
 327
 328         if tid < 0:
 329             # seek to start of file to ensure iteration is over
 330             # all entries.
 331             bgzf_seek( self.tabixfile.fp, 0, 0)
 332             self.iterator = ti_iter_first()
 333         else:
 334             self.iterator = ti_queryi(self.tabixfile, tid, start, end)
 335
 336         if <void*>self.iterator == NULL:
 337             raise ValueError("malformatted query or wrong sequence name.\n")
 338
 339     def __iter__(self):
 340         return self
 341
 342     def __next__(self):
 343         """python version of next().
 344
 345         pyrex uses this non-standard name instead of next()
 346         """
 347
 348         cdef char * s
 349         cdef int len
 350         while 1:
 351             s = ti_read(self.tabixfile, self.iterator, &len)
 352             if s == NULL: raise StopIteration
 353             # todo: read metachar from configuration
 354             if s[0] != '#': break
 355
 356         return self.parser(s, len)
 357
 358     def __dealloc__(self):
 359         if <void*>self.iterator != NULL:
 360             ti_iter_destroy(self.iterator)
 361
 362 def tabix_compress( filename_in,
 363               filename_out,
 364               force = False ):
 365
 366     '''
 367     compress *filename_in* writing the output to *filename_out*.
 368
 369     Raise an IOError if *filename_out* already exists, unless *force* is set.
 370     '''
 371
 372     if not force and os.path.exists(filename_out ):
 373         raise IOError( "Filename '%s' already exists, use *force* to overwrite" % filename_out)
 374
 375     cdef int WINDOW_SIZE
 376     cdef int c, r
 377     cdef void * buffer
 378     cdef BGZF * fp
 379     cdef int fd_src
 380
 381     cdef int O_RDONLY
 382     O_RDONLY = os.O_RDONLY
 383
 384     WINDOW_SIZE = 64 * 1024
 385
 386     fp = bgzf_open( filename_out, "w")
 387     if fp == NULL:
 388         raise IOError( "could not open '%s' for writing" )
 389
 390     fd_src = open(filename_in, O_RDONLY)
 391     if fd_src == 0:
 392         raise IOError( "could not open '%s' for reading" )
 393
 394     buffer = malloc(WINDOW_SIZE)
 395
 396     while c > 0:
 397         c = read(fd_src, buffer, WINDOW_SIZE)
 398         r = bgzf_write(fp, buffer, c)
 399         if r < 0:
 400             free( buffer )
 401             raise OSError("writing failed")
 402
 403     free( buffer )
 404     r = bgzf_close(fp)
 405     if r < 0: raise OSError("writing failed")
 406
 407 def tabix_index( filename,
 408                  force = False,
 409                  seq_col = None,
 410                  start_col = None,
 411                  end_col = None,
 412                  preset = None,
 413                  meta_char = "#",
 414                  zerobased = False,
 415                 ):
 416     '''
 417     index tab-separated *filename* using tabix.
 418
 419     An existing index will not be overwritten unless
 420     *force* is set.
 421
 422     The index will be built from coordinates
 423     in columns *seq_col*, *start_col* and *end_col*.
 424
 425     The contents of *filename* have to be sorted by
 426     contig and position - the method does not check
 427     if the file is sorted.
 428
 429     Column indices are 0-based. Coordinates in the file
 430     are assumed to be 1-based.
 431
 432     If *preset* is provided, the column coordinates
 433     are taken from a preset. Valid values for preset
 434     are "gff", "bed", "sam", "vcf", psltbl", "pileup".
 435
 436     Lines beginning with *meta_char* and the first
 437     *line_skip* lines will be skipped.
 438
 439     If *filename* does not end in ".gz", it will be automatically
 440     compressed. The original file will be removed and only the
 441     compressed file will be retained.
 442
 443     If *filename* ends in *gz*, the file is assumed to be already
 444     compressed with bgzf.
 445
 446     returns the filename of the compressed data
 447     '''
 448
 449     if not os.path.exists(filename): raise IOError("No such file '%s'" % filename)
 450
 451     if not filename.endswith(".gz"):
 452
 453         tabix_compress( filename, filename + ".gz", force = force )
 454         os.unlink( filename )
 455         filename += ".gz"
 456
 457     if not force and os.path.exists(filename + ".tbi" ):
 458         raise IOError( "Filename '%s.tbi' already exists, use *force* to overwrite" )
 459
 460     # columns (1-based)
 461     # preset-code, contig, start, end, metachar for commends, lines to ignore at beginning
 462     # 0 is a missing column
 463     preset2conf = {
 464         'gff' : ( 0, 1, 4, 5, ord('#'), 0 ),
 465         'bed' : ( 0x10000, 1, 2, 3, ord('#'), 0 ),
 466         'psltbl' : ( 0x10000, 15, 17, 18, ord('#'), 0 ),
 467         'sam' : ( 1, 3, 4, 0, ord('#'), 0 ),
 468         'vcf' : ( 2, 1, 2, 0, ord('#'), 0 ),
 469         'pileup': (3, 1, 2, 0, ord('#'), 0 ),
 470         }
 471
 472     if preset:
 473         try:
 474             conf_data = preset2conf[preset]
 475         except KeyError:
 476             raise KeyError( "unknown preset '%s', valid presets are '%s'" % (preset, ",".join(preset2conf.keys() )))
 477     else:
 478         if end_col == None: end_col = -1
 479         preset = 0
 480
 481         # note that tabix internally works with 0-based coordinates and open/closed intervals.
 482         # When using a preset, conversion is automatically taken care of.
 483         # Otherwise, the coordinates are assumed to be 1-based closed intervals and
 484         # -1 is subtracted from the start coordinate. To avoid doing this, set
 485         # the TI_FLAG_UCSC=0x10000 flag:
 486         if zerobased: preset = preset | 0x10000
 487
 488         conf_data = (preset, seq_col+1, start_col+1, end_col+1, ord(meta_char), 0)
 489
 490     cdef ti_conf_t conf
 491     conf.preset, conf.sc, conf.bc, conf.ec, conf.meta_char, conf.line_skip = conf_data
 492
 493     ti_index_build( filename, &conf)
 494
 495     return filename
 496
 497 __all__ = ["tabix_index",
 498            "tabix_compress",
 499            "Tabixfile",
 500            "asTuple",
 501            "asGTF",
 502            "asVCF",
 503            "asBed",
 504            ]