1 # cython: embedsignature=True
2 # adds doc-strings for sphinx
4 import tempfile, os, sys, types, itertools, struct, ctypes
7 '''*(filename, mode='r')*
9 opens a :term:`tabix file` for reading. A missing
10 index (*filename* + ".tbi") will raise an exception.
# NOTE(review): fragment of the `Tabixfile` cdef class body. The leading
# numbers on each line are original-source line numbers embedded in this
# listing; the listing is non-contiguous, so some statements are elided.
15 # pointer to tabixfile
# C-level handle to the open tabix file; NULL means "closed".
16 cdef tabix_t * tabixfile
# Constructor delegates all arguments verbatim to _open().
18 def __cinit__(self, *args, **kwargs ):
20 self._open( *args, **kwargs )
# _isOpen (its `def` line is elided here): truth-test on the C handle.
23 '''return true if samfile has been opened.'''
24 return self.tabixfile != NULL
# _open (its `def` line is elided in this listing): validate mode, check that
# both the data file and its ".tbi" index exist, then open via the tabix C API.
30 '''open a :term:`tabix file` for reading.
# Only read mode is supported; any other mode fails the assertion.
33 assert mode in ( "r",), "invalid file opening mode `%s`" % mode
35 # close a previously opened file
36 if self.tabixfile != NULL: self.close()
39 self.filename = filename
# tabix convention: the index lives next to the data file as <name>.tbi
40 filename_index = filename + ".tbi"
43 # open file for writing
47 # open file for reading
# Fail early with IOError if either the data file or the index is missing.
48 if not os.path.exists( self.filename ):
49 raise IOError( "file `%s` not found" % self.filename)
51 if not os.path.exists( filename_index ):
52 raise IOError( "index `%s` not found" % filename_index)
54 # open file and load index
# ti_open returns NULL on failure; surface that as IOError as well.
55 self.tabixfile = ti_open( self.filename, filename_index )
57 if self.tabixfile == NULL:
58 raise IOError("could not open file `%s`" % filename )
60 def _parseRegion( self,
65 '''parse region information.
67 raise ValueError for invalid regions.
69 returns a tuple of region, tid, start and end. Region
70 is a valid samtools :term:`region` or None if the region
71 extends over the whole file.
73 Note that regions are 1-based, while start,end are python coordinates.
# Ensure the index is loaded before any region parsing.
75 ti_lazy_index_load( self.tabixfile )
83 rtid = rstart = rend = 0
85 # translate to a region
# Convert 0-based python start to the 1-based samtools string form.
# Missing start defaults to 1; missing end defaults to max_pos-1.
87 if start != None and end != None:
88 region = "%s:%i-%i" % (reference, start+1, end)
89 elif start == None and end != None:
90 region = "%s:%i-%i" % (reference, 1, end)
91 elif end == None and start != None:
92 region = "%s:%i-%i" % (reference, start+1, max_pos-1)
# Let the tabix C parser resolve contig name -> tid and fill coordinates.
97 ti_parse_region( self.tabixfile.idx, region, &rtid, &rstart, &rend)
98 if rtid < 0: raise ValueError( "invalid region `%s`" % region )
99 if rstart > rend: raise ValueError( 'invalid region: start (%i) > end (%i)' % (rstart, rend) )
100 if not 0 <= rstart < max_pos: raise ValueError( 'start out of range (%i)' % rstart )
101 if not 0 <= rend < max_pos: raise ValueError( 'end out of range (%i)' % rend )
103 return region, rtid, rstart, rend
# fetch (its `def` line is elided in this listing): region query entry point.
113 fetch one or more rows in a :term:`region` using 0-based indexing. The region is specified by
114 :term:`reference`, *start* and *end*. Alternatively, a samtools :term:`region` string can be supplied.
116 Without *reference* or *region* all entries will be fetched.
118 If only *reference* is set, all reads matching on *reference* will be fetched.
120 If *parser* is None, the results are returned as an unparsed string.
121 Otherwise, *parser* is assumed to be a functor that will return parsed
122 data (see for example :meth:`asTuple` and :meth:`asGTF`).
124 ti_lazy_index_load( self.tabixfile )
# Guard against use after close().
126 if not self._isOpen():
127 raise ValueError( "I/O operation on closed file" )
129 region, rtid, rstart, rend = self._parseRegion( reference, start, end, region )
# Unparsed variants: a concrete region vs. whole-file iteration (tid == -1).
133 return TabixIterator( self, rtid, rstart, rend )
135 return TabixIterator( self, -1, 0, 0 )
# Parsed variants mirror the two cases above, wrapping rows with *parser*.
138 return TabixIteratorParsed( self, rtid, rstart, rend, parser )
140 return TabixIteratorParsed( self, -1, 0, 0, parser )
# contigs property (declaration line elided): list of sequence names from the index.
143 '''chromosome names'''
# ti_seqname returns a C array of names plus a count via out-parameter.
145 cdef char ** sequences
148 ti_lazy_index_load( self.tabixfile )
149 sequences = ti_seqname( self.tabixfile.idx, &nsequences )
# Copy C strings into a python list (Pyrex-style counted loop).
152 for x from 0 <= x < nsequences:
153 result.append( sequences[x] )
cdef class TabixIterator:
157 """iterates over rows in *tabixfile* in region
158 given by *tid*, *start* and *end*.
# C-level iterator handle and a borrowed pointer to the open tabix file.
161 cdef ti_iter_t iterator
162 cdef tabix_t * tabixfile
164 def __cinit__(self, Tabixfile tabixfile,
165 int tid, int start, int end ):
167 assert tabixfile._isOpen()
169 # makes sure that samfile stays alive as long as the
# Keep only the raw C pointer; the python-level Tabixfile holds ownership.
171 self.tabixfile = tabixfile.tabixfile
174 # seek to start of file to ensure iteration is over
# Whole-file iteration: rewind the bgzf stream and start a plain iterator.
176 bgzf_seek( self.tabixfile.fp, 0, 0)
177 self.iterator = ti_iter_first()
# Region iteration: query by numeric tid and 0-based coordinates.
179 self.iterator = ti_queryi(self.tabixfile, tid, start, end)
181 if <void*>self.iterator == NULL:
182 raise ValueError("malformatted query or wrong sequence name.\n")
188 """python version of next().
190 pyrex uses this non-standard name instead of next()
# ti_read returns NULL at end of iteration -> translate to StopIteration.
195 s = ti_read(self.tabixfile, self.iterator, &len)
196 if s == NULL: raise StopIteration
# Release the C iterator when the python object is collected.
199 def __dealloc__(self):
200 if <void*>self.iterator != NULL:
201 ti_iter_destroy(self.iterator)
# toDot (its `def` line is elided): GTF-style missing-value marker.
204 '''convert value to '.' if None'''
205 if v == None: return "."
# quote (its `def` line is elided): string values are wrapped in quotes
# when serializing attributes; other types presumably pass through.
209 '''return a quoted attribute.'''
# types.StringTypes is python-2 era (str + unicode).
210 if type(v) in types.StringTypes:
cdef class TupleProxy:
216 '''Proxy class for access to parsed row as a tuple.
218 This class represents a table row for fast read-access.
227 def __cinit__(self ):
# take(): adopt the buffer pointer; this object becomes responsible for it.
233 cdef take( self, char * buffer, size_t nbytes ):
234 '''start presenting buffer.
236 Take ownership of the pointer.
239 self.update( buffer, nbytes )
# present(): borrow the buffer; caller keeps ownership/lifetime.
241 cdef present( self, char * buffer, size_t nbytes ):
242 '''start presenting buffer.
244 Do not take ownership of the pointer.
246 self.update( buffer, nbytes )
# copy(): duplicate the buffer (nbytes + terminating NUL) into owned memory.
248 cdef copy( self, char * buffer, size_t nbytes ):
249 '''start presenting buffer.
251 Take a copy of buffer.
255 s = sizeof(char) * (nbytes + 1)
256 self.data = <char*>malloc( s )
257 memcpy( <char*>self.data, buffer, s )
258 self.update( self.data, nbytes )
# update(): split the NUL-terminated buffer into tab-separated fields,
# storing pointers into the buffer itself (no per-field copies).
260 cdef update( self, char * buffer, size_t nbytes ):
261 '''update internal data.'''
# Buffer must be NUL-terminated at exactly nbytes.
268 if buffer[nbytes] != 0:
269 raise ValueError( "incomplete line at %s" % buffer )
271 if self.fields != NULL:
# Heuristic capacity: a field needs at least ~4 bytes, so nbytes/4 slots.
274 max_fields = nbytes / 4
275 self.fields = <char **>calloc( max_fields, sizeof(char *) )
# Scan for tabs with memchr; each hit starts the next field.
284 pos = <char*>memchr( pos, '\t', nbytes )
285 if pos == NULL: break
288 self.fields[field] = pos
290 if field >= max_fields:
291 raise ValueError("row too large - more than %i fields" % max_fields )
292 nbytes -= pos - old_pos
# Sequence protocol: python-style negative indices are supported.
298 def __getitem__( self, key ):
302 if i < 0: i += self.nfields
303 if i >= self.nfields or i < 0:
304 raise IndexError( "list index out of range" )
305 return self.fields[i]
# Free owned buffer memory on collection.
310 def __dealloc__(self):
311 if self.data != NULL:
319 """python version of next().
# Iterator protocol over fields; index was advanced before this return.
321 if self.index >= self.nfields:
324 return self.fields[self.index-1]
# NOTE(review): interior of the GTFProxy cdef class; its `cdef class` header
# line is elided from this listing.
327 '''Proxy class for access to GTF fields.
329 This class represents a GTF entry for fast read-access.
330 Write-access has been added as well, though some care must
331 be taken. If any of the string fields (contig, source, ...)
332 are set, the new value is tied to the lifetime of the
333 argument that was supplied.
335 The only exception is the attributes field when set from
336 a dictionary - this field will manage its own memory.
# True when the attributes string was allocated by fromDict() and
# therefore must be freed by this object.
353 cdef bint hasOwnAttributes
355 def __cinit__(self ):
357 self.isModified = False
358 self.hasOwnAttributes = False
# take/present/copy mirror TupleProxy's ownership variants and reset
# the modification flag after (re)parsing.
360 cdef take( self, char * buffer, size_t nbytes ):
361 '''start presenting buffer.
363 Take ownership of the pointer.
366 self.update( buffer, nbytes )
367 self.isModified = False
369 cdef present( self, char * buffer, size_t nbytes ):
370 '''start presenting buffer.
372 Do not take ownership of the pointer.
374 self.update( buffer, nbytes )
375 self.isModified = False
377 cdef copy( self, char * buffer, size_t nbytes ):
378 '''start presenting buffer.
380 Take a copy of buffer.
384 s = sizeof(char) * (nbytes + 1)
385 self.data = <char*>malloc( s )
386 memcpy( <char*>self.data, buffer, s )
387 self.update( self.data, nbytes )
388 self.isModified = False
# update(): parse the nine tab-separated GTF columns in place by walking
# successive strchr('\t') hits; each NULL result means a malformed row.
390 cdef update( self, char * buffer, size_t nbytes ):
391 '''update internal data.
393 nbytes does not include the terminal '\0'.
396 cdef char * cstart, * cend, * cscore
401 if buffer[nbytes] != 0:
402 raise ValueError( "incomplete line at %s" % buffer )
404 pos = strchr( buffer, '\t' )
405 if pos == NULL: raise ValueError( "malformatted entry at %s" % buffer )
410 pos = strchr( pos, '\t' )
411 if pos == NULL: raise ValueError( "malformatted entry at %s" % buffer )
416 pos = strchr( pos, '\t' )
417 if pos == NULL: raise ValueError( "malformatted entry at %s" % buffer )
422 pos = strchr( pos, '\t' )
423 if pos == NULL: raise ValueError( "malformatted entry at %s" % buffer )
428 pos = strchr( pos, '\t' )
429 if pos == NULL: raise ValueError( "malformatted entry at %s" % buffer )
434 pos = strchr( pos, '\t' )
435 if pos == NULL: raise ValueError( "malformatted entry at %s" % buffer )
440 pos = strchr( pos, '\t' )
441 if pos == NULL: raise ValueError( "malformatted entry at %s" % buffer )
446 pos = strchr( pos, '\t' )
447 if pos == NULL: raise ValueError( "malformatted entry at %s" % buffer )
# Remainder of the line is the attributes column.
450 self.attributes = pos
# Convert GTF 1-based closed start to python 0-based; end stays as-is,
# yielding 0-based half-open coordinates overall.
451 self.start = atoi( cstart ) - 1
452 self.end = atoi( cend )
# Column accessors: each property returns the parsed field and, on
# assignment, records the change via isModified (setter bodies partially
# elided from this listing).
455 '''contig of feature.'''
456 def __get__( self ): return self.contig
457 def __set__( self, value ):
458 self.isModified = True
463 def __get__( self ): return self.feature
464 def __set__( self, value ):
465 self.isModified = True
469 '''feature source.'''
470 def __get__( self ): return self.source
471 def __set__( self, value ):
472 self.isModified = True
476 '''feature start (in 0-based open/closed coordinates).'''
477 def __get__( self ): return self.start
478 def __set__( self, value ):
479 self.isModified = True
483 '''feature end (in 0-based open/closed coordinates).'''
484 def __get__( self ): return self.end
485 def __set__( self, value ):
486 self.isModified = True
# score: a lone "." denotes a missing score; otherwise parse as float.
492 if self.score[0] == '.' and self.score[1] == '\0' :
495 return atof(self.score)
496 def __set__( self, value ):
497 self.isModified = True
501 '''feature strand.'''
502 def __get__( self ): return self.strand
503 def __set__( self, value ):
504 self.isModified = True
509 def __get__( self ): return self.frame
510 def __set__( self, value ):
511 self.isModified = True
515 '''feature attributes (as a string).'''
516 def __get__( self ): return self.attributes
517 def __set__( self, value ):
518 self.isModified = True
# NOTE(review): storing `value` directly ties the C pointer to the
# lifetime of the python argument, as warned in the class docstring.
519 self.attributes = value
# asDict (its `def` line is elided): parse the GTF attributes column
# (semicolon-separated `key "value"` pairs) into a python dict.
522 """parse attributes - return as dict
526 attributes = self.attributes
528 # separate into fields
# Trailing element after the final ';' is empty, hence the [:-1] slice.
529 fields = [ x.strip() for x in attributes.split(";")[:-1]]
535 d = [ x.strip() for x in f.split(" ")]
# Values containing spaces keep all trailing tokens.
538 if len(d) > 2: v = d[1:]
# Strip surrounding quotes from quoted values.
540 if v[0] == '"' and v[-1] == '"':
543 ## try to convert to a value
556 def fromDict( self, d ):
557 '''set attributes from a dictionary.'''
561 # clean up if this field is set twice
562 if self.hasOwnAttributes:
563 free(self.attributes)
# Serialize: strings are quoted, everything else uses str().
566 for k,v in d.items():
567 if type(v) == types.StringType:
568 aa.append( '%s "%s"' % (k,v) )
570 aa.append( '%s %s' % (k,str(v)) )
572 a = "; ".join( aa ) + ";"
# Allocate owned memory for the serialized string (+1 for the NUL).
575 self.attributes = <char *>calloc( l + 1, sizeof(char) )
576 memcpy( self.attributes, p, l )
578 self.hasOwnAttributes = True
579 self.isModified = True
# __str__ fragment (def line elided): rebuild the tab-separated row by
# copying the buffer and turning the parser's NUL separators back into tabs.
597 cpy = <char*>calloc( sizeof(char), self.nbytes+1 )
598 memcpy( cpy, self.data, self.nbytes+1)
599 for x from 0 <= x < self.nbytes:
600 if cpy[x] == '\0': cpy[x] = '\t'
605 def invert( self, int lcontig ):
606 '''invert coordinates to negative strand coordinates
608 This method will only act if the feature is on the
# Only features on the '-' strand are mirrored against contig length.
611 if self.strand[0] == '-':
612 start = min(self.start, self.end)
613 end = max(self.start, self.end)
614 self.start, self.end = lcontig - end, lcontig - start
# keys (def line elided): first token of each ';'-separated attribute.
617 '''return a list of attributes defined in this entry.'''
619 return [ x.strip().split(" ")[0] for x in r.split(";") if x.strip() != '' ]
# Dict-style access delegates to attribute lookup.
621 def __getitem__(self, item):
622 return self.__getattr__( item )
# Free owned buffers: the row copy and, if fromDict() allocated it,
# the attributes string.
624 def __dealloc__(self):
625 if self.data != NULL:
627 if self.hasOwnAttributes:
628 free(self.attributes)
630 def __getattr__(self, item ):
631 """Generic lookup of attribute from GFF/GTF attributes
632 Only called if there *isn't* an attribute with this name
# Locate `item` inside the raw attributes string with strstr.
641 start = strstr( self.attributes, query)
643 raise AttributeError("'GTFProxy' has no attribute '%s'" % item )
# Skip past the key and the separating space, then any extra blanks.
645 start += strlen(query) + 1
647 while start[0] == " ": start += 1
# Value runs to the closing quote (or end of string).
651 while end[0] != '\0' and end[0] != '"': end += 1
653 cpy = <char*>calloc( l, sizeof(char ) )
654 memcpy( cpy, start, l )
662 def setAttribute( self, name, value ):
663 '''convenience method to set an attribute.'''
cdef class asTuple(Parser):
672 '''converts a :term:`tabix row` into a python tuple.'''
# Functor used by fetch(parser=...): wraps the raw row in a TupleProxy.
673 def __call__(self, char * buffer, int len):
676 # need to copy - there were some
677 # persistence issues with "present"
678 r.copy( buffer, len )
cdef class asGTF(Parser):
682 '''converts a :term:`tabix row` into a GTF record.'''
# Same copy-based strategy as asTuple, producing a GTFProxy.
683 def __call__(self, char * buffer, int len):
686 r.copy( buffer, len )
cdef class TabixIteratorParsed:
690 """iterates over mapped reads in a region.
# Parallels TabixIterator, but each fetched row is passed through the
# stored parser functor before being returned.
693 cdef ti_iter_t iterator
694 cdef tabix_t * tabixfile
704 assert tabixfile._isOpen()
707 # makes sure that samfile stays alive as long as the
# Borrow the raw C pointer; the Tabixfile object owns the file.
709 self.tabixfile = tabixfile.tabixfile
712 # seek to start of file to ensure iteration is over
714 bgzf_seek( self.tabixfile.fp, 0, 0)
715 self.iterator = ti_iter_first()
# Region query path (tid >= 0).
717 self.iterator = ti_queryi(self.tabixfile, tid, start, end)
719 if <void*>self.iterator == NULL:
720 raise ValueError("malformatted query or wrong sequence name.\n")
726 """python version of next().
728 pyrex uses this non-standard name instead of next()
733 s = ti_read(self.tabixfile, self.iterator, &len)
734 if s == NULL: raise StopIteration
# Delegate row parsing to the functor supplied at construction.
735 return self.parser(s, len)
737 def __dealloc__(self):
738 if <void*>self.iterator != NULL:
739 ti_iter_destroy(self.iterator)
741 def tabix_compress( filename_in,
746 compress *filename_in* writing the output to *filename_out*.
748 Raise an IOError if *filename_out* already exists, unless *force* is set.
751 if not force and os.path.exists(filename_out ):
752 raise IOError( "Filename '%s' already exists, use *force* to overwrite" % filename_out)
# Copy loop parameters: raw read of the input, bgzf-compressed write-out.
761 O_RDONLY = os.O_RDONLY
763 WINDOW_SIZE = 64 * 1024
765 fp = bgzf_open( filename_out, "w")
# NOTE(review): this error message has a '%s' placeholder but no format
# argument on the visible line — likely a latent bug upstream.
767 raise IOError( "could not open '%s' for writing" )
769 fd_src = open(filename_in, O_RDONLY)
# NOTE(review): same missing format argument here.
771 raise IOError( "could not open '%s' for reading" )
773 buffer = malloc(WINDOW_SIZE)
# Stream WINDOW_SIZE chunks until read() returns 0 bytes.
776 c = read(fd_src, buffer, WINDOW_SIZE)
777 r = bgzf_write(fp, buffer, c)
780 raise OSError("writing failed")
# Final flush/close result is also checked.
784 if r < 0: raise OSError("writing failed")
786 def tabix_index( filename,
796 index tab-separated *filename* using tabix.
798 An existing index will not be overwritten unless
801 The index will be built from coordinates
802 in columns *seq_col*, *start_col* and *end_col*.
804 The contents of *filename* have to be sorted by
805 contig and position - the method does not check
806 if the file is sorted.
808 Column indices are 0-based. Coordinates in the file
809 are assumed to be 1-based.
811 If *preset* is provided, the column coordinates
812 are taken from a preset. Valid values for preset
813 are "gff", "bed", "sam", "vcf", psltbl", "pileup".
815 Lines beginning with *meta_char* and the first
816 *line_skip* lines will be skipped.
818 If *filename* does not end in ".gz", it will be automatically
819 compressed. The original file will be removed and only the
820 compressed file will be retained.
822 If *filename* ends in *gz*, the file is assumed to be already
823 compressed with bgzf.
825 returns the filename of the compressed data
828 if not os.path.exists(filename): raise IOError("No such file '%s'" % filename)
# Transparently bgzf-compress plain files, replacing the original.
830 if not filename.endswith(".gz"):
832 tabix_compress( filename, filename + ".gz", force = force )
833 os.unlink( filename )
836 if not force and os.path.exists(filename + ".tbi" ):
# NOTE(review): '%s.tbi' placeholder without a format argument on the
# visible line — message will contain the literal '%s'.
837 raise IOError( "Filename '%s.tbi' already exists, use *force* to overwrite" )
840 # preset-code, contig, start, end, metachar for commends, lines to ignore at beginning
841 # 0 is a missing column
# Each tuple: (preset flag, seq col, start col, end col, meta char, skip).
# Columns here are 1-based as the tabix C library expects.
843 'gff' : ( 0, 1, 4, 5, ord('#'), 0 ),
844 'bed' : ( 0x10000, 1, 2, 3, ord('#'), 0 ),
845 'psltbl' : ( 0x10000, 15, 17, 18, ord('#'), 0 ),
846 'sam' : ( 1, 3, 4, 0, ord('#'), 0 ),
847 'vcf' : ( 2, 1, 2, 0, ord('#'), 0 ),
848 'pileup': (3, 1, 2, 0, ord('#'), 0 ),
853 conf_data = preset2conf[preset]
855 raise KeyError( "unknown preset '%s', valid presets are '%s'" % (preset, ",".join(preset2conf.keys() )))
# A missing end column is signalled to the C layer as -1.
857 if end_col == None: end_col = -1
860 # note that tabix internally works with 0-based coordinates and open/closed intervals.
861 # When using a preset, conversion is automatically taken care of.
862 # Otherwise, the coordinates are assumed to be 1-based closed intervals and
863 # -1 is subtracted from the start coordinate. To avoid doing this, set
864 # the TI_FLAG_UCSC=0x10000 flag:
865 if zerobased: preset = preset | 0x10000
# Shift user-facing 0-based column indices to the 1-based C convention.
867 conf_data = (preset, seq_col+1, start_col+1, end_col+1, ord(meta_char), 0)
# Populate the C ti_conf_t struct and build the .tbi index.
870 conf.preset, conf.sc, conf.bc, conf.ec, conf.meta_char, conf.line_skip = conf_data
872 ti_index_build( filename, &conf)
876 __all__ = ["tabix_index",