1 # cython: embedsignature=True
2 # adds doc-strings for sphinx
# Helper functions for python 3 compatibility - taken from csamtools.pyx
6 import tempfile, os, sys, types, itertools, struct, ctypes, gzip
10 from cpython cimport PyErr_SetString, PyBytes_Check, \
11 PyUnicode_Check, PyBytes_FromStringAndSize, \
12 PyObject_AsFileDescriptor
14 PYTHON3 = PY_MAJOR_VERSION >= 3
16 # from cpython cimport PyString_FromStringAndSize, PyString_AS_STRING
17 from cpython.version cimport PY_MAJOR_VERSION
cdef from_string_and_size(char* s, size_t length):
    """Convert a C char buffer of *length* bytes to the native str type.

    NOTE(review): the Python-2 branch body and the `else` line are not
    visible in this view; the ASCII decode below is presumably the
    Python-3 path — confirm against the full source.
    """
    if PY_MAJOR_VERSION < 3:
        return s[:length].decode("ascii")
# filename encoding (copied from lxml.etree.pyx)
# Resolve the encoding used for filenames: prefer the OS filesystem
# encoding, fall back to the interpreter default, and finally to ASCII.
cdef str _FILENAME_ENCODING
_FILENAME_ENCODING = sys.getfilesystemencoding()
if _FILENAME_ENCODING is None:
    _FILENAME_ENCODING = sys.getdefaultencoding()
if _FILENAME_ENCODING is None:
    _FILENAME_ENCODING = 'ascii'
#cdef char* _C_FILENAME_ENCODING
#_C_FILENAME_ENCODING = <char*>_FILENAME_ENCODING
cdef bytes _my_encodeFilename(object filename):
    # NOTE(review): the first branch (presumably a None check) and the
    # bytes-passthrough return are not visible in this view.
    u"""Make sure a filename is 8-bit encoded (or None).
    """
    elif PyBytes_Check(filename):
        # already bytes: presumably returned unchanged (return line not visible)
    elif PyUnicode_Check(filename):
        # unicode: encode with the resolved filesystem encoding
        return filename.encode(_FILENAME_ENCODING)
    # any other type is rejected (Py2-style raise statement)
    raise TypeError, u"Argument must be string or unicode."
cdef bytes _force_bytes(object s):
    # NOTE(review): several interior lines (None handling, the Py2
    # passthrough and the bytes-passthrough returns) are not visible
    # in this view.
    u"""convert string or unicode object to bytes, assuming ascii encoding.
    """
    if PY_MAJOR_VERSION < 3:
        # Py2: body not visible in this view
    elif PyBytes_Check(s):
        # already bytes: body not visible in this view
    elif PyUnicode_Check(s):
        return s.encode('ascii')
    # any other type is rejected (Py2-style raise statement)
    raise TypeError, u"Argument must be string, bytes or unicode."
cdef inline bytes _force_cmdline_bytes(object s):
    """Encode a command-line argument to bytes.

    Thin alias for :func:`_force_bytes`; kept as a separate name so
    call sites document their intent.
    """
    return _force_bytes(s)
cdef _charptr_to_str(char* s):
    """Convert a NUL-terminated C string to the native str type.

    NOTE(review): the Python-2 branch body and `else` line are not
    visible in this view; the ASCII decode below is presumably the
    Python-3 path — confirm against the full source.
    """
    if PY_MAJOR_VERSION < 3:
        return s.decode("ascii")
cdef _force_str(object s):
    """Return s converted to str type of current Python (bytes in Py2, unicode in Py3)"""
    # NOTE(review): interior lines (None handling, the Py2 passthrough
    # and the final fall-through return) are not visible in this view.
    if PY_MAJOR_VERSION < 3:
        # Py2: body not visible in this view
    elif PyBytes_Check(s):
        return s.decode('ascii')
85 '''*(filename, mode='r')*
87 opens a :term:`tabix file` for reading. A missing
88 index (*filename* + ".tbi") will raise an exception.
def __cinit__(self, filename, mode = 'r', *args, **kwargs ):
    # Delegate all opening work to _open(); extra positional and
    # keyword arguments are passed through unchanged.
    self._open( filename, mode, *args, **kwargs )
# _isOpen — the `def` header line is not visible in this view.
'''return true if samfile has been opened.'''
# True while the underlying C tabix handle is non-NULL; close() resets it.
return self.tabixfile != NULL
# _open — the `def _open(...)` header and several interior lines are
# not visible in this view.
'''open a :term:`tabix file` for reading.
'''
# only read mode is supported
assert mode in ( "r",), "invalid file opening mode `%s`" % mode

# close a previously opened file
if self.tabixfile != NULL: self.close()
self.tabixfile = NULL

# the index file is expected next to the data file
filename_index = filename + ".tbi"
self.isremote = filename.startswith( "http:") or filename.startswith( "ftp:" )

# encode all the strings
filename = _my_encodeFilename(filename)
filename_index = _my_encodeFilename(filename_index)
cdef bytes bmode = mode.encode('ascii')

# free any previously stored filename before taking a fresh copy
if self._filename != NULL: free(self._filename )

self._filename = strdup(filename)

# open file for writing
# NOTE(review): the guarding condition line is not visible here —
# write mode is explicitly unsupported.
raise NotImplementedError("writing to tabix files not implemented" )

# open file for reading
# existence checks are skipped for remote (http/ftp) files
if not self.isremote:
    if not os.path.exists( filename ):
        raise IOError( "file `%s` not found" % filename)

    if not os.path.exists( filename_index ):
        raise IOError( "index `%s` not found" % filename_index)

# open file and load index
self.tabixfile = ti_open( filename, filename_index )

if self.tabixfile == NULL:
    raise IOError("could not open file `%s`" % filename )
def _parseRegion( self,
                  # NOTE(review): the remaining signature lines
                  # (reference, start, end, region parameters) are not
                  # visible in this view.
    '''parse region information.

    raise ValueError for invalid regions.

    returns a tuple of region, tid, start and end. Region
    is a valid samtools :term:`region` or None if the region
    extends over the whole file.

    Note that regions are 1-based, while start,end are python coordinates.
    '''
    ti_lazy_index_load( self.tabixfile )

    rtid = rstart = rend = 0

    # translate to a region string understood by the C tabix parser
    # (start is converted from 0-based python to 1-based samtools)
    if start != None and end != None:
        region = "%s:%i-%i" % (reference, start+1, end)
    elif start == None and end != None:
        region = "%s:%i-%i" % (reference, 1, end)
    elif end == None and start != None:
        region = "%s:%i-%i" % (reference, start+1, max_pos-1)

    # NOTE(review): lines between the branches above and the parse call
    # are not visible in this view (presumably the reference-only and
    # explicit-region cases).
    region = _force_bytes(region)
    ti_parse_region( self.tabixfile.idx, region,
                     &rtid, &rstart, &rend)
    if rtid < 0: raise ValueError( "invalid region `%s`" % region )
    if rstart > rend: raise ValueError( 'invalid region: start (%i) > end (%i)' % (rstart, rend) )
    if not 0 <= rstart < max_pos: raise ValueError( 'start out of range (%i)' % rstart )
    if not 0 <= rend < max_pos: raise ValueError( 'end out of range (%i)' % rend )

    return region, rtid, rstart, rend
# fetch — the `def fetch(...)` header and the docstring opening are not
# visible in this view.
fetch one or more rows in a :term:`region` using 0-based indexing. The region is specified by
:term:`reference`, *start* and *end*. Alternatively, a samtools :term:`region` string can be supplied.

Without *reference* or *region* all entries will be fetched.

If only *reference* is set, all reads matching on *reference* will be fetched.

If *parser* is None, the results are returned as an unparsed string.
Otherwise, *parser* is assumed to be a functor that will return parsed
data (see for example :meth:`asTuple` and :meth:`asGTF`).

ti_lazy_index_load( self.tabixfile )

if not self._isOpen():
    raise ValueError( "I/O operation on closed file" )

region, rtid, rstart, rend = self._parseRegion( reference, start, end, region )

# NOTE(review): the branch conditions selecting among the four returns
# below are not visible in this view; presumably they dispatch on
# whether a region was given and whether a parser was supplied.
return TabixIterator( self, rtid, rstart, rend )
return TabixIterator( self, -1, 0, 0 )
return TabixIteratorParsed( self, rtid, rstart, rend, parser )
return TabixIteratorParsed( self, -1, 0, 0, parser )
227 ###############################################################
228 ###############################################################
229 ###############################################################
231 ###############################################################
# property filename — the `property`/`def __get__` header lines are not
# visible in this view.
'''filename associated with this object.'''
if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
return self._filename
# property header — the header lines and part of the docstring are not
# visible in this view.
The header is returned as an iterator over lines without the
return TabixHeaderIterator( self )
# property contigs — the header lines are not visible in this view.
'''chromosome names'''
cdef char ** sequences
# NOTE(review): the declarations of `nsequences`, `x` and `result` are
# not visible in this view.
ti_lazy_index_load( self.tabixfile )
sequences = ti_seqname( self.tabixfile.idx, &nsequences )
# copy the C string array returned by tabix into the Python result list
for x from 0 <= x < nsequences:
    result.append( sequences[x] )
# close — the `def close` header and docstring opening are not visible
# in this view.
closes the :class:`pysam.Tabixfile`.'''
# Idempotent: only closes when a handle is open, then marks it NULL so
# __dealloc__ will not double-free.
if self.tabixfile != NULL:
    ti_close( self.tabixfile )
    self.tabixfile = NULL
def __dealloc__( self ):
    # remember: dealloc cannot call other python methods
    # note: no doc string
    # note: __del__ is not called.
    # Release the C tabix handle (if close() was never called) and the
    # strdup()'ed filename copy taken in _open().
    if self.tabixfile != NULL:
        ti_close( self.tabixfile )
        self.tabixfile = NULL
    if self._filename != NULL: free( self._filename )
cdef class TabixIterator:
    """iterates over rows in *tabixfile* in region
    given by *tid*, *start* and *end*.
    """

    def __cinit__(self, Tabixfile tabixfile,
                  int tid, int start, int end ):

        assert tabixfile._isOpen()

        # makes sure that samfile stays alive as long as the
        # iterator: keep a reference to the underlying C handle
        self.tabixfile = tabixfile.tabixfile

        # NOTE(review): branch lines are missing from this view — the
        # seek/ti_iter_first pair and the ti_queryi call presumably sit
        # on alternative branches (whole-file vs. region query).
        # seek to start of file to ensure iteration is over
        # the whole file
        bgzf_seek( self.tabixfile.fp, 0, 0)
        self.iterator = ti_iter_first()
        self.iterator = ti_queryi(self.tabixfile, tid, start, end)

        if <void*>self.iterator == NULL:
            raise ValueError("malformatted query or wrong sequence name.\n")

    # NOTE(review): the `def __next__` header and its read loop are not
    # visible in this view.
    """python version of next().

    pyrex uses this non-standard name instead of next()
    """
    # metachar filtering does not work within tabix
    # though it should. Getting the metachar is a pain
    # as ti_index_t is incomplete type.

    # simply use '#' for now.
    s = ti_read(self.tabixfile, self.iterator, &len)
    if s == NULL: raise StopIteration
    if s[0] != '#': break

    retval = _charptr_to_str( s )

    def __dealloc__(self):
        # free the C-level iterator; safe to call after StopIteration
        if <void*>self.iterator != NULL:
            ti_iter_destroy(self.iterator)
cdef class TabixHeaderIterator:
    """return header lines.
    """

    def __cinit__(self, Tabixfile tabixfile ):

        assert tabixfile._isOpen()

        # makes sure that samfile stays alive as long as the
        # iterator: keep a reference to the underlying C handle
        self.tabixfile = tabixfile.tabixfile

        # a NULL-region query starts reading from the top of the file
        self.iterator = ti_query(self.tabixfile, NULL, 0, 0)

        if <void*>self.iterator == NULL:
            raise ValueError("can't open header.\n")

    # NOTE(review): the `def __next__` header and loop lines are not
    # visible in this view.
    """python version of next().

    pyrex uses this non-standard name instead of next()
    """
    # Getting the metachar is a pain as ti_index_t is incomplete type.
    # simply use '#' for now.
    s = ti_read(self.tabixfile, self.iterator, &len)
    if s == NULL: raise StopIteration
    # stop at first non-header line
    if s[0] != '#': raise StopIteration

    def __dealloc__(self):
        # free the C-level iterator
        if <void*>self.iterator != NULL:
            ti_iter_destroy(self.iterator)
375 #########################################################
376 #########################################################
377 #########################################################
cdef class asTuple(Parser):
    '''converts a :term:`tabix row` into a python tuple.

    Access is by numeric index.
    '''
    def __call__(self, char * buffer, int len):
        cdef TabProxies.TupleProxy r
        r = TabProxies.TupleProxy()
        # need to copy - there were some
        # persistence issues with "present"
        r.copy( buffer, len )
        # NOTE(review): the `return r` line is not visible in this view.
cdef class asGTF(Parser):
    # NOTE(review): most docstring lines are not visible in this view.
    '''converts a :term:`tabix row` into a GTF record with the following

    genomic start coordinate (0-based)
    genomic end coordinate plus one (0-based)

    GTF formatted entries also defined the attributes:

    the transcript identifier
    '''
    def __call__(self, char * buffer, int len):
        cdef TabProxies.GTFProxy r
        r = TabProxies.GTFProxy()
        # copy the buffer so the proxy owns its memory; persistence
        # issues were seen with "present"
        r.copy( buffer, len )
        # NOTE(review): the `return r` line is not visible in this view.
cdef class asBed( Parser ):
    # NOTE(review): several docstring lines are not visible in this view.
    '''converts a :term:`tabix row` into a bed record
    with the following fields:

    genomic start coordinate (zero-based)
    genomic end coordinate plus one (zero-based)

    ',' separated string of block sizes
    ',' separated string of block genomic start positions

    Only the first three fields are required. Additional
    fields are optional, but if one is defined, all the preceding
    need to be defined as well.
    '''
    def __call__(self, char * buffer, int len):
        cdef TabProxies.BedProxy r
        r = TabProxies.BedProxy()
        # copy the buffer so the proxy owns its memory; persistence
        # issues were seen with "present"
        r.copy( buffer, len )
        # NOTE(review): the `return r` line is not visible in this view.
cdef class asVCF( Parser ):
    # NOTE(review): several docstring lines are not visible in this view.
    '''converts a :term:`tabix row` into a VCF record with
    the following fields:

    chromosomal position, zero-based

    Access to genotypes is via index::

       first_sample_genotype = vcf[0]
       second_sample_genotype = vcf[1]
    '''
    def __call__(self, char * buffer, int len ):
        cdef TabProxies.VCFProxy r
        r = TabProxies.VCFProxy()
        # copy the buffer so the proxy owns its memory; persistence
        # issues were seen with "present"
        r.copy( buffer, len )
        # NOTE(review): the `return r` line is not visible in this view.
507 #########################################################
508 #########################################################
509 #########################################################
cdef class TabixIteratorParsed:
    """iterates over mapped reads in a region.
    """

    # NOTE(review): the `def __cinit__(...)` header lines (presumably
    # taking tabixfile, tid, start, end, parser) are not visible here.
    assert tabixfile._isOpen()

    # makes sure that samfile stays alive as long as the
    # iterator: keep a reference to the underlying C handle
    self.tabixfile = tabixfile.tabixfile

    # NOTE(review): branch lines are missing — the seek/ti_iter_first
    # pair and the ti_queryi call presumably sit on alternative
    # branches (whole-file vs. region query).
    # seek to start of file to ensure iteration is over
    # the whole file
    bgzf_seek( self.tabixfile.fp, 0, 0)
    self.iterator = ti_iter_first()
    self.iterator = ti_queryi(self.tabixfile, tid, start, end)

    if <void*>self.iterator == NULL:
        raise ValueError("malformatted query or wrong sequence name.\n")

    # NOTE(review): the `def __next__` header and its read loop are not
    # visible in this view.
    """python version of next().

    pyrex uses this non-standard name instead of next()
    """
    s = ti_read(self.tabixfile, self.iterator, &len)
    if s == NULL: raise StopIteration
    # todo: read metachar from configuration
    if s[0] != '#': break
    # hand the raw C line to the user-supplied parser
    return self.parser(s, len)

    def __dealloc__(self):
        # free the C-level iterator
        if <void*>self.iterator != NULL:
            ti_iter_destroy(self.iterator)
def tabix_compress( filename_in,
                    # NOTE(review): the remaining signature lines
                    # (filename_out, force) and the docstring opening
                    # are not visible in this view.
    compress *filename_in* writing the output to *filename_out*.

    Raise an IOError if *filename_out* already exists, unless *force* is set.

    if not force and os.path.exists(filename_out ):
        raise IOError( "Filename '%s' already exists, use *force* to overwrite" % filename_out)

    O_RDONLY = os.O_RDONLY

    WINDOW_SIZE = 64 * 1024

    fn = _force_bytes(filename_out)
    fp = bgzf_open( fn, "w")
    # NOTE(review): BUG — the message below has a '%s' placeholder but
    # no '%' argument; it prints the literal '%s'. Should be
    # `% filename_out` (the guarding condition line is not visible).
    raise IOError( "could not open '%s' for writing" )

    fn = _force_bytes(filename_in)
    fd_src = open(fn, O_RDONLY)
    # NOTE(review): same missing-'%'-argument bug as above
    # (should be `% filename_in`).
    raise IOError( "could not open '%s' for reading" )

    buffer = malloc(WINDOW_SIZE)

    # copy WINDOW_SIZE-sized chunks from the source fd into the bgzf
    # output stream (loop header not visible in this view)
    c = read(fd_src, buffer, WINDOW_SIZE)
    r = bgzf_write(fp, buffer, c)
    raise OSError("writing failed")

    if r < 0: raise OSError("writing failed")
def tabix_index( filename,
                 # NOTE(review): the remaining signature lines
                 # (force, seq_col, start_col, end_col, preset,
                 # meta_char, zerobased, ...) are not visible here.
    index tab-separated *filename* using tabix.

    An existing index will not be overwritten unless
    *force* is set.

    The index will be built from coordinates
    in columns *seq_col*, *start_col* and *end_col*.

    The contents of *filename* have to be sorted by
    contig and position - the method does not check
    if the file is sorted.

    Column indices are 0-based. Coordinates in the file
    are assumed to be 1-based.

    If *preset* is provided, the column coordinates
    are taken from a preset. Valid values for preset
    are "gff", "bed", "sam", "vcf", "psltbl", "pileup".

    Lines beginning with *meta_char* and the first
    *line_skip* lines will be skipped.

    If *filename* does not end in ".gz", it will be automatically
    compressed. The original file will be removed and only the
    compressed file will be retained.

    If *filename* ends in *gz*, the file is assumed to be already
    compressed with bgzf.

    returns the filename of the compressed data

    if not os.path.exists(filename): raise IOError("No such file '%s'" % filename)

    # compress the input first if it is not bgzf-compressed yet; the
    # uncompressed original is removed
    if not filename.endswith(".gz"):

        tabix_compress( filename, filename + ".gz", force = force )
        os.unlink( filename )

    if not force and os.path.exists(filename + ".tbi" ):
        # NOTE(review): BUG — the message below has a '%s' placeholder
        # but no '%' argument; should be `% filename`.
        raise IOError( "Filename '%s.tbi' already exists, use *force* to overwrite" )

    # preset-code, contig, start, end, metachar for comments, lines to ignore at beginning
    # 0 is a missing column
    # NOTE(review): the `preset2conf = {` line is not visible here.
    'gff' : ( 0, 1, 4, 5, ord('#'), 0 ),
    'bed' : ( 0x10000, 1, 2, 3, ord('#'), 0 ),
    'psltbl' : ( 0x10000, 15, 17, 18, ord('#'), 0 ),
    'sam' : ( 1, 3, 4, 0, ord('#'), 0 ),
    'vcf' : ( 2, 1, 2, 0, ord('#'), 0 ),
    'pileup': (3, 1, 2, 0, ord('#'), 0 ),

    # NOTE(review): the try/except lines around the preset lookup are
    # not visible in this view.
    conf_data = preset2conf[preset]
    raise KeyError( "unknown preset '%s', valid presets are '%s'" % (preset, ",".join(preset2conf.keys() )))

    if end_col == None: end_col = -1

    # note that tabix internally works with 0-based coordinates and open/closed intervals.
    # When using a preset, conversion is automatically taken care of.
    # Otherwise, the coordinates are assumed to be 1-based closed intervals and
    # -1 is subtracted from the start coordinate. To avoid doing this, set
    # the TI_FLAG_UCSC=0x10000 flag:
    if zerobased: preset = preset | 0x10000

    conf_data = (preset, seq_col+1, start_col+1, end_col+1, ord(meta_char), 0)

    # NOTE(review): the `cdef ti_conf_t conf` declaration line is not
    # visible in this view.
    conf.preset, conf.sc, conf.bc, conf.ec, conf.meta_char, conf.line_skip = conf_data

    fn = _my_encodeFilename( filename )
    ti_index_build( fn, &conf)
701 #########################################################
702 #########################################################
703 #########################################################
704 ## Iterators for parsing through unindexed files.
705 #########################################################
ctypedef class tabix_inplace_iterator:
    '''iterate over ``infile``.

    This iterator is not safe. If the :meth:`__next__()` method is called
    after ``infile`` is closed, the result is undefined (see ``fclose()``).

    The iterator might either raise a StopIteration or segfault.
    '''

    def __cinit__(self, infile, int buffer_size = 65536 ):

        cdef int fd = PyObject_AsFileDescriptor( infile )
        if fd == -1: raise ValueError( "I/O operation on closed file." )
        self.infile = fdopen( fd, 'r')

        if self.infile == NULL: raise ValueError( "I/O operation on closed file." )

        # pre-allocate the line buffer; getline() may grow it later
        self.buffer = <char*>malloc( buffer_size )
        self.size = buffer_size

    cdef __cnext__(self):
        # NOTE(review): local declarations between this header and the
        # loop (e.g. `b`, `r`, `nbytes`) are not visible in this view.
        while not feof( self.infile ):
            nbytes = getline( &b, &self.size, self.infile)

            # stop at first error or eof
            if (nbytes == -1): break
            # skip comment lines
            if (b[0] == '#'): continue

            # skip empty lines
            if b[0] == '\0' or b[0] == '\n' or b[0] == '\r': continue

            # make sure that entry is complete
            if b[nbytes-1] != '\n' and b[nbytes-1] != '\r':
                raise ValueError( "incomplete line at %s" % result )

            # make sure that this goes fully through C
            # otherwise buffer is copied to/from a
            # Python object causing segfaults as
            # the wrong memory is freed
            r.present( b, nbytes )

    def __dealloc__(self):
        # NOTE(review): the dealloc body (presumably freeing
        # self.buffer) and the subsequent `def __next__` header are not
        # visible in this view; the return below belongs to __next__.
        return self.__cnext__()
ctypedef class tabix_copy_iterator:
    '''iterate over ``infile``.

    This iterator is not safe. If the :meth:`__next__()` method is called
    after ``infile`` is closed, the result is undefined (see ``fclose()``).

    The iterator might either raise a StopIteration or segfault.
    '''

    def __cinit__(self, infile, Parser parser ):

        cdef int fd = PyObject_AsFileDescriptor( infile )
        if fd == -1: raise ValueError( "I/O operation on closed file." )
        self.infile = fdopen( fd, 'r')
        if self.infile == NULL: raise ValueError( "I/O operation on closed file." )
        # NOTE(review): the line storing `parser` on self is not
        # visible in this view.

    cdef __cnext__(self):
        # NOTE(review): local declarations (e.g. `b`, `nbytes`) are not
        # visible in this view.
        while not feof( self.infile ):

            # getline allocates on demand
            # return number of characters read excluding null byte
            nbytes = getline( &b, &nbytes, self.infile)

            # stop at first error
            if (nbytes == -1): break
            # skip comment lines
            if (b[0] == '#'): continue

            # skip empty lines
            if b[0] == '\0' or b[0] == '\n' or b[0] == '\r': continue

            # make sure that entry is complete
            if b[nbytes-1] != '\n' and b[nbytes-1] != '\r':
                raise ValueError( "incomplete line at %s" % result )

            # make sure that this goes fully through C
            # otherwise buffer is copied to/from a
            # Python object causing segfaults as
            # the wrong memory is freed
            # -1 to remove the new-line character
            return self.parser(b, nbytes)

    # NOTE(review): the `def __next__` header is not visible in this
    # view; the return below belongs to it.
    return self.__cnext__()
class tabix_generic_iterator:
    '''iterate over ``infile``.

    Permits the use of file-like objects for example from the gzip module.
    '''
    def __init__(self, infile, parser ):
        # NOTE(review): the lines storing `infile` and `parser` on self
        # are not visible in this view.
        if self.infile.closed: raise ValueError( "I/O operation on closed file." )

    # cython version - required for python 3
    # NOTE(review): the `def __next__` header, loop construct and the
    # declarations of `b`/`nbytes` are not visible in this view.
    line = self.infile.readline()
    s = _force_bytes( line )
    assert b[nbytes] == '\0'

    # skip comment lines
    if (b[0] == '#'): continue

    # skip empty lines
    if b[0] == '\0' or b[0] == '\n' or b[0] == '\r': continue

    # make sure that entry is complete
    if b[nbytes-1] != '\n' and b[nbytes-1] != '\r':
        raise ValueError( "incomplete line at %s" % line )

    # take a C-level copy of the line so the parser owns its memory
    cpy = <char*>malloc(nbytes+1)
    if cpy == NULL: raise MemoryError()
    memcpy( cpy, b, nbytes+1)

    return self.parser(cpy, nbytes)

    # python version - required for python 2.7
    # NOTE(review): the `def next` header is not visible in this view.
    return self.__next__()
def tabix_iterator( infile, parser ):
    """return an iterator over all entries in a file."""
    # Always use the generic (pure file-object) iterator; the
    # commented alternatives below dispatched on the file type.
    return tabix_generic_iterator( infile, parser )
    # file objects can use C stdio
    # used to be: isinstance( infile, file):

    # if isinstance( infile, io.IOBase ):
    #     return tabix_copy_iterator( infile, parser )

    #     return tabix_generic_iterator( infile, parser )

    # if isinstance( infile, file ):
    #     return tabix_copy_iterator( infile, parser )

    #     return tabix_generic_iterator( infile, parser )
896 __all__ = ["tabix_index",
904 "tabix_inplace_iterator"