X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=pysam.git;a=blobdiff_plain;f=pysam%2Fcsamtools.pyx;fp=pysam%2Fcsamtools.pyx;h=e94e0e626bde58542b961a5a6ff9fe23bae46675;hp=242e68a402474774b414561186de6cdb6050144e;hb=7f85f82bcb01134b7398f0d26b9d1b9a21a3e66c;hpb=1880a0e7e45766a58e2b42c59d6993cb543e4538 diff --git a/pysam/csamtools.pyx b/pysam/csamtools.pyx index 242e68a..e94e0e6 100644 --- a/pysam/csamtools.pyx +++ b/pysam/csamtools.pyx @@ -167,6 +167,14 @@ class StderrStore(): self.stderr_save = Outs( sys.stderr.fileno() ) self.stderr_save.setfd( self.stderr_h ) + def readAndRelease( self ): + self.stderr_save.restore() + lines = [] + if os.path.exists(self.stderr_f): + lines = open( self.stderr_f, "r" ).readlines() + os.remove( self.stderr_f ) + return lines + def release(self): self.stderr_save.restore() if os.path.exists(self.stderr_f): @@ -316,7 +324,7 @@ cdef class Samfile: else: # build header from a target names and lengths - assert referencenames and referencelengths, "either supply options `template`, `header` or both `refernencenames` and `referencelengths` for writing" + assert referencenames and referencelengths, "either supply options `template`, `header` or both `referencenames` and `referencelengths` for writing" assert len(referencenames) == len(referencelengths), "unequal names and lengths of reference sequences" # allocate and fill header @@ -361,7 +369,13 @@ cdef class Samfile: store = StderrStore() self.samfile = samopen( filename, mode, NULL ) - store.release() + result = store.readAndRelease() + # test for specific messages as open also outputs status messages + # that can be ignored. + if "[bam_header_read] invalid BAM binary header (this is not a BAM file).\n" in result: + raise ValueError( "invalid BAM binary header (is this a BAM file?)" ) + elif '[samopen] no @SQ lines in the header.\n' in result: + raise ValueError( "no @SQ lines in the header (is this a SAM file?)") if self.samfile == NULL: raise IOError("could not open file `%s`" % filename ) @@ -385,6 +399,7 @@ cdef class Samfile: def getrname( self, tid ): '''(tid ) convert numerical :term:`tid` into :ref:`reference` name.''' + if not self._isOpen(): raise ValueError( "I/O operation on closed file" ) if not 0 <= tid < self.samfile.header.n_targets: raise ValueError( "tid out of range 0<=tid<%i" % self.samfile.header.n_targets ) return self.samfile.header.target_name[tid] @@ -404,6 +419,9 @@ cdef class Samfile: Note that regions are 1-based, while start,end are python coordinates. ''' + # This method's main objective is to translate from a reference to a tid. + # For now, it calls bam_parse_region, which is clumsy. Might be worth + # implementing it all in pysam (makes use of khash). cdef int rtid cdef int rstart @@ -416,14 +434,15 @@ cdef class Samfile: # translate to a region if reference: if start != None and end != None: + if start > end: raise ValueError( 'invalid region: start (%i) > end (%i)' % (start, end) ) region = "%s:%i-%i" % (reference, start+1, end) else: region = reference if region: - store = StderrStore() + # this function might be called often (multiprocessing) + # thus avoid using StderrStore, see issue 46. bam_parse_region( self.samfile.header, region, &rtid, &rstart, &rend) - store.release() if rtid < 0: raise ValueError( "invalid region `%s`" % region ) if rstart > rend: raise ValueError( 'invalid region: start (%i) > end (%i)' % (rstart, rend) ) if not 0 <= rstart < max_pos: raise ValueError( 'start out of range (%i)' % rstart ) @@ -442,6 +461,8 @@ cdef class Samfile: def tell( self ): '''return current file position''' + if not self._isOpen(): + raise ValueError( "I/O operation on closed file" ) if not self.isbam: raise NotImplementedError("seek only available in bam files") @@ -605,6 +626,9 @@ cdef class Samfile: return the number of bytes written. ''' + if not self._isOpen(): + raise ValueError( "I/O operation on closed file" ) + return samwrite( self.samfile, read._delegate ) def __enter__(self): @@ -617,11 +641,13 @@ cdef class Samfile: property nreferences: '''number of :term:`reference` sequences in the file.''' def __get__(self): + if not self._isOpen(): raise ValueError( "I/O operation on closed file" ) return self.samfile.header.n_targets property references: """tuple with the names of :term:`reference` sequences.""" def __get__(self): + if not self._isOpen(): raise ValueError( "I/O operation on closed file" ) t = [] for x from 0 <= x < self.samfile.header.n_targets: t.append( self.samfile.header.target_name[x] ) @@ -631,6 +657,7 @@ cdef class Samfile: """tuple of the lengths of the :term:`reference` sequences. The lengths are in the same order as :attr:`pysam.Samfile.reference` """ def __get__(self): + if not self._isOpen(): raise ValueError( "I/O operation on closed file" ) t = [] for x from 0 <= x < self.samfile.header.n_targets: t.append( self.samfile.header.target_len[x] ) @@ -639,6 +666,7 @@ cdef class Samfile: property text: '''full contents of the :term:`sam file` header as a string.''' def __get__(self): + if not self._isOpen(): raise ValueError( "I/O operation on closed file" ) return PyString_FromStringAndSize(self.samfile.header.text, self.samfile.header.l_text) property header: @@ -646,6 +674,8 @@ cdef class Samfile: a two-level dictionary. ''' def __get__(self): + if not self._isOpen(): raise ValueError( "I/O operation on closed file" ) + result = {} if self.samfile.header.text != NULL: @@ -750,6 +780,7 @@ cdef class Samfile: return dest def __iter__(self): + if not self._isOpen(): raise ValueError( "I/O operation on closed file" ) return self cdef bam1_t * getCurrent( self ): @@ -797,7 +828,9 @@ cdef class Fastafile: return self.fastafile != NULL def __len__(self): - assert self.fastafile != NULL + if self.fastafile == NULL: + raise ValueError( "calling len() on closed file" ) + return faidx_fetch_nseq(self.fastafile) def _open( self, @@ -841,7 +874,7 @@ cdef class Fastafile: if not self._isOpen(): raise ValueError( "I/O operation on closed file" ) - cdef int len, max_pos + cdef int length, max_pos cdef char * seq max_pos = 2 << 29 @@ -855,23 +888,25 @@ cdef class Fastafile: # valid ranges are from 0 to 2^29-1 if not 0 <= start < max_pos: raise ValueError( 'start out of range (%i)' % start ) if not 0 <= end < max_pos: raise ValueError( 'end out of range (%i)' % end ) - - seq = faidx_fetch_seq(self.fastafile, reference, + seq = faidx_fetch_seq(self.fastafile, + reference, start, - end-1, &len) + end-1, + &length) else: # samtools adds a '\0' at the end - seq = fai_fetch( self.fastafile, region, &len ) + seq = fai_fetch( self.fastafile, region, &length ) # copy to python if seq == NULL: return "" else: - result = seq - # clean up - free(seq) - - return result + try: + py_seq = PyString_FromStringAndSize(seq, length) + finally: + free(seq) + + return py_seq ########################################################################### ########################################################################### @@ -903,8 +938,11 @@ cdef class IteratorRow: def __cinit__(self, Samfile samfile, int tid, int beg, int end ): - assert samfile._isOpen() - assert samfile._hasIndex() + if not samfile._isOpen(): + raise ValueError( "I/O operation on closed file" ) + + if not samfile._hasIndex(): + raise ValueError( "no index available for pileup" ) # makes sure that samfile stays alive as long as the # iterator is alive @@ -958,7 +996,8 @@ cdef class IteratorRowAll: def __cinit__(self, Samfile samfile): - assert samfile._isOpen() + if not samfile._isOpen(): + raise ValueError( "I/O operation on closed file" ) if samfile.isbam: mode = "rb" else: mode = "r" @@ -1480,6 +1519,7 @@ cdef class AlignedRead: property tags: """the tags in the AUX field. + This property permits convenience access to the tags. Changes it the returned list will not update the tags automatically. Instead, @@ -1493,63 +1533,49 @@ cdef class AlignedRead: cdef char * ctag cdef bam1_t * src cdef uint8_t * s - cdef char tpe + cdef char auxtag[3] + cdef char auxtype src = self._delegate if src.l_aux == 0: return None s = bam1_aux( src ) result = [] - ctag = calloc( 3, sizeof(char) ) - cdef int x + auxtag[2] = 0 while s < (src.data + src.data_len): # get tag - ctag[0] = s[0] - ctag[1] = s[1] - pytag = ctag - + auxtag[0] = s[0] + auxtag[1] = s[1] s += 2 + auxtype = s[0] - # convert type - is there a better way? - ctag[0] = s[0] - ctag[1] = 0 - pytype = ctag - # get type and value - # how do I do char literal comparison in cython? - # the code below works (i.e, is C comparison) - tpe = toupper(s[0]) - if tpe == 'S': + if auxtype in ('c', 'C'): value = bam_aux2i(s) - s += 2 - elif tpe == 'I': + s += 1 + elif auxtype in ('s', 'S'): value = bam_aux2i(s) + s += 2 + elif auxtype in ('i', 'I'): + value = bam_aux2i(s) s += 4 - elif tpe == 'F': + elif auxtype == 'f': value = bam_aux2f(s) s += 4 - elif tpe == 'D': + elif auxtype == 'd': value = bam_aux2d(s) s += 8 - elif tpe == 'C': - value = bam_aux2i(s) - s += 1 - elif tpe == 'A': - # there might a more efficient way - # to convert a char into a string + elif auxtype == 'A': value = "%c" % bam_aux2A(s) s += 1 - elif tpe == 'Z': + elif auxtype in ('Z', 'H'): value = bam_aux2Z(s) # +1 for NULL terminated string s += len(value) + 1 - - # skip over type + # s += 1 + + result.append( (auxtag, value) ) - # ignore pytype - result.append( (pytag, value) ) - - free( ctag ) return result def __set__(self, tags): @@ -1557,51 +1583,57 @@ cdef class AlignedRead: cdef bam1_t * src cdef uint8_t * s cdef uint8_t * new_data + cdef char * temp cdef int guessed_size, control_size + cdef int max_size, size, offset + src = self._delegate - cdef int max_size, size max_size = 4000 - - # map samtools code to python.struct code and byte size - buffer = ctypes.create_string_buffer(max_size) - offset = 0 - for pytag, value in tags: - t = type(value) - if t == types.FloatType: - fmt = "= -127: fmt, pytype = "= -32767: fmt, pytype = " 4294967295: raise ValueError( "integer %i out of range of BAM/SAM specification" % value ) - else: fmt, pytype = "= -127: fmt, pytype = "= -32767: fmt, pytype = " 4294967295: raise ValueError( "integer %i out of range of BAM/SAM specification" % value ) + else: fmt, pytype = " max_size: - raise NotImplementedError("tags field too large") - - struct.pack_into( fmt, - buffer, - offset, - pytag[0], - pytag[1], - pytype, - value ) - offset += size + # Note: hex strings (H) are not supported yet + if len(value) == 1: + fmt, pytype = " max_size: + raise NotImplementedError("tags field too large") + + struct.pack_into( fmt, + buffer, + offset, + pytag[0], + pytag[1], + pytype, + value ) + offset += size # delete the old data and allocate new + # if offset == 0, the aux field will be + # empty pysam_bam_update( src, src.l_aux, offset, @@ -1609,15 +1641,15 @@ cdef class AlignedRead: src.l_aux = offset - if offset == 0: return + # copy data only if there is any + if offset != 0: - # get location of new data - s = bam1_aux( src ) + # get location of new data + s = bam1_aux( src ) - # check if there is direct path from buffer.raw to tmp - cdef char * temp - temp = buffer.raw - memcpy( s, temp, offset ) + # check if there is direct path from buffer.raw to tmp + temp = buffer.raw + memcpy( s, temp, offset ) property flag: """properties flag"""