Imported Upstream version 0.3.1

[pysam.git] / pysam / csamtools.pyx
diff --git a/pysam/csamtools.pyx b/pysam/csamtools.pyx

index 242e68a402474774b414561186de6cdb6050144e..e94e0e626bde58542b961a5a6ff9fe23bae46675 100644 (file)
--- a/pysam/csamtools.pyx
+++ b/pysam/csamtools.pyx
@@ -167,6 +167,14 @@ class StderrStore():
          self.stderr_save = Outs( sys.stderr.fileno() )
          self.stderr_save.setfd( self.stderr_h )
          
+    def readAndRelease( self ):
+        self.stderr_save.restore()
+        lines = []
+        if os.path.exists(self.stderr_f):
+            lines = open( self.stderr_f, "r" ).readlines()
+            os.remove( self.stderr_f )
+        return lines
+
      def release(self):
          self.stderr_save.restore()
          if os.path.exists(self.stderr_f):
@@ -316,7 +324,7 @@ cdef class Samfile:
  
              else:
                  # build header from a target names and lengths
-                assert referencenames and referencelengths, "either supply options `template`, `header` or  both `refernencenames` and `referencelengths` for writing"
+                assert referencenames and referencelengths, "either supply options `template`, `header` or  both `referencenames` and `referencelengths` for writing"
                  assert len(referencenames) == len(referencelengths), "unequal names and lengths of reference sequences"
  
                  # allocate and fill header
@@ -361,7 +369,13 @@ cdef class Samfile:
  
              store = StderrStore()
              self.samfile = samopen( filename, mode, NULL )
-            store.release()
+            result = store.readAndRelease()
+            # test for specific messages as open also outputs status messages
+            # that can be ignored.
+            if "[bam_header_read] invalid BAM binary header (this is not a BAM file).\n" in result:
+                raise ValueError( "invalid BAM binary header (is this a BAM file?)" )
+            elif '[samopen] no @SQ lines in the header.\n' in result:
+                raise ValueError( "no @SQ lines in the header (is this a SAM file?)")
  
          if self.samfile == NULL:
              raise IOError("could not open file `%s`" % filename )
@@ -385,6 +399,7 @@ cdef class Samfile:
      def getrname( self, tid ):
          '''(tid )
          convert numerical :term:`tid` into :ref:`reference` name.'''
+        if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
          if not 0 <= tid < self.samfile.header.n_targets:
              raise ValueError( "tid out of range 0<=tid<%i" % self.samfile.header.n_targets )
          return self.samfile.header.target_name[tid]
@@ -404,6 +419,9 @@ cdef class Samfile:
  
          Note that regions are 1-based, while start,end are python coordinates.
          '''
+        # This method's main objective is to translate from a reference to a tid. 
+        # For now, it calls bam_parse_region, which is clumsy. Might be worth
+        # implementing it all in pysam (makes use of khash).
          
          cdef int rtid
          cdef int rstart
@@ -416,14 +434,15 @@ cdef class Samfile:
          # translate to a region
          if reference:
              if start != None and end != None:
+                if start > end: raise ValueError( 'invalid region: start (%i) > end (%i)' % (start, end) )
                  region = "%s:%i-%i" % (reference, start+1, end)
              else:
                  region = reference
  
          if region:
-            store = StderrStore()
+            # this function might be called often (multiprocessing)
+            # thus avoid using StderrStore, see issue 46.
              bam_parse_region( self.samfile.header, region, &rtid, &rstart, &rend)        
-            store.release()
              if rtid < 0: raise ValueError( "invalid region `%s`" % region )
              if rstart > rend: raise ValueError( 'invalid region: start (%i) > end (%i)' % (rstart, rend) )
              if not 0 <= rstart < max_pos: raise ValueError( 'start out of range (%i)' % rstart )
@@ -442,6 +461,8 @@ cdef class Samfile:
  
      def tell( self ):
          '''return current file position'''
+        if not self._isOpen():
+            raise ValueError( "I/O operation on closed file" )
          if not self.isbam:
              raise NotImplementedError("seek only available in bam files")
  
@@ -605,6 +626,9 @@ cdef class Samfile:
  
          return the number of bytes written.
          '''
+        if not self._isOpen():
+            raise ValueError( "I/O operation on closed file" )
+
          return samwrite( self.samfile, read._delegate )
  
      def __enter__(self):
@@ -617,11 +641,13 @@ cdef class Samfile:
      property nreferences:
          '''number of :term:`reference` sequences in the file.'''
          def __get__(self):
+            if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
              return self.samfile.header.n_targets
  
      property references:
          """tuple with the names of :term:`reference` sequences."""
          def __get__(self): 
+            if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
              t = []
              for x from 0 <= x < self.samfile.header.n_targets:
                  t.append( self.samfile.header.target_name[x] )
@@ -631,6 +657,7 @@ cdef class Samfile:
          """tuple of the lengths of the :term:`reference` sequences. The lengths are in the same order as :attr:`pysam.Samfile.reference`
          """
          def __get__(self): 
+            if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
              t = []
              for x from 0 <= x < self.samfile.header.n_targets:
                  t.append( self.samfile.header.target_len[x] )
@@ -639,6 +666,7 @@ cdef class Samfile:
      property text:
          '''full contents of the :term:`sam file` header as a string.'''
          def __get__(self):
+            if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
              return PyString_FromStringAndSize(self.samfile.header.text, self.samfile.header.l_text)
  
      property header:
@@ -646,6 +674,8 @@ cdef class Samfile:
          a two-level dictionary.
          '''
          def __get__(self):
+            if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
+
              result = {}
  
              if self.samfile.header.text != NULL:
@@ -750,6 +780,7 @@ cdef class Samfile:
          return dest
  
      def __iter__(self):
+        if not self._isOpen(): raise ValueError( "I/O operation on closed file" )
          return self 
  
      cdef bam1_t * getCurrent( self ):
@@ -797,7 +828,9 @@ cdef class Fastafile:
          return self.fastafile != NULL
  
      def __len__(self):
-        assert self.fastafile != NULL
+        if self.fastafile == NULL:
+            raise ValueError( "calling len() on closed file" )
+
          return faidx_fetch_nseq(self.fastafile)
  
      def _open( self, 
@@ -841,7 +874,7 @@ cdef class Fastafile:
          if not self._isOpen():
              raise ValueError( "I/O operation on closed file" )
  
-        cdef int len, max_pos
+        cdef int length, max_pos
          cdef char * seq
          max_pos = 2 << 29
  
@@ -855,23 +888,25 @@ cdef class Fastafile:
              # valid ranges are from 0 to 2^29-1
              if not 0 <= start < max_pos: raise ValueError( 'start out of range (%i)' % start )
              if not 0 <= end < max_pos: raise ValueError( 'end out of range (%i)' % end )
-
-            seq = faidx_fetch_seq(self.fastafile, reference, 
+            seq = faidx_fetch_seq(self.fastafile, 
+                                  reference, 
                                    start,
-                                  end-1, &len)
+                                  end-1, 
+                                  &length)
          else:
              # samtools adds a '\0' at the end
-            seq = fai_fetch( self.fastafile, region, &len )
+            seq = fai_fetch( self.fastafile, region, &length )
  
          # copy to python
          if seq == NULL: 
              return ""
          else:
-            result = seq
-            # clean up
-            free(seq)
-        
-        return result
+            try:
+                py_seq = PyString_FromStringAndSize(seq, length)
+            finally:
+                free(seq)
+
+        return py_seq
  
  ###########################################################################
  ###########################################################################
@@ -903,8 +938,11 @@ cdef class IteratorRow:
  
      def __cinit__(self, Samfile samfile, int tid, int beg, int end ):
  
-        assert samfile._isOpen()
-        assert samfile._hasIndex()
+        if not samfile._isOpen():
+            raise ValueError( "I/O operation on closed file" )
+        
+        if not samfile._hasIndex():
+            raise ValueError( "no index available for pileup" )
          
          # makes sure that samfile stays alive as long as the
          # iterator is alive
@@ -958,7 +996,8 @@ cdef class IteratorRowAll:
  
      def __cinit__(self, Samfile samfile):
  
-        assert samfile._isOpen()
+        if not samfile._isOpen():
+            raise ValueError( "I/O operation on closed file" )
  
          if samfile.isbam: mode = "rb"
          else: mode = "r"
@@ -1480,6 +1519,7 @@ cdef class AlignedRead:
  
      property tags:
          """the tags in the AUX field.
+
          This property permits convenience access to 
          the tags. Changes it the returned list will
          not update the tags automatically. Instead,
@@ -1493,63 +1533,49 @@ cdef class AlignedRead:
              cdef char * ctag
              cdef bam1_t * src
              cdef uint8_t * s
-            cdef char tpe
+            cdef char auxtag[3]
+            cdef char auxtype
              
              src = self._delegate
              if src.l_aux == 0: return None
              
              s = bam1_aux( src )
              result = []
-            ctag = <char*>calloc( 3, sizeof(char) )
-            cdef int x
+            auxtag[2] = 0
              while s < (src.data + src.data_len):
                  # get tag
-                ctag[0] = s[0]
-                ctag[1] = s[1]
-                pytag = ctag
-
+                auxtag[0] = s[0]
+                auxtag[1] = s[1]
                  s += 2
+                auxtype = s[0]
  
-                # convert type - is there a better way?
-                ctag[0] = s[0]
-                ctag[1] = 0
-                pytype = ctag
-                # get type and value 
-                # how do I do char literal comparison in cython?
-                # the code below works (i.e, is C comparison)
-                tpe = toupper(s[0])
-                if tpe == 'S':
+                if auxtype in ('c', 'C'):
                      value = <int>bam_aux2i(s)            
-                    s += 2
-                elif tpe == 'I':
+                    s += 1
+                elif auxtype in ('s', 'S'):
                      value = <int>bam_aux2i(s)            
+                    s += 2
+                elif auxtype in ('i', 'I'):
+                    value = <float>bam_aux2i(s)
                      s += 4
-                elif tpe == 'F':
+                elif auxtype == 'f':
                      value = <float>bam_aux2f(s)
                      s += 4
-                elif tpe == 'D':
+                elif auxtype == 'd':
                      value = <double>bam_aux2d(s)
                      s += 8
-                elif tpe == 'C':
-                    value = <int>bam_aux2i(s)
-                    s += 1
-                elif tpe == 'A':
-                    # there might a more efficient way
-                    # to convert a char into a string
+                elif auxtype == 'A':
                      value = "%c" % <char>bam_aux2A(s)
                      s += 1
-                elif tpe == 'Z':
+                elif auxtype in ('Z', 'H'):
                      value = <char*>bam_aux2Z(s)
                      # +1 for NULL terminated string
                      s += len(value) + 1
-
-                # skip over type
+                 # 
                  s += 1
+  
+                result.append( (auxtag, value) )
  
-                # ignore pytype
-                result.append( (pytag, value) )
-
-            free( ctag )
              return result
  
          def __set__(self, tags):
@@ -1557,51 +1583,57 @@ cdef class AlignedRead:
              cdef bam1_t * src
              cdef uint8_t * s
              cdef uint8_t * new_data
+            cdef char * temp 
              cdef int guessed_size, control_size
+            cdef int max_size, size, offset
+
              src = self._delegate
-            cdef int max_size, size
              max_size = 4000
-
-            # map samtools code to python.struct code and byte size
-            buffer = ctypes.create_string_buffer(max_size)
-
              offset = 0
-            for pytag, value in tags:
-                t = type(value)
-                if t == types.FloatType:
-                    fmt = "<cccf"
-                elif t == types.IntType:
-                    if value < 0:
-                        if value >= -127: fmt, pytype = "<cccb", 'c'
-                        elif value >= -32767: fmt, pytype = "<ccch", 's'
-                        elif value < -2147483648: raise ValueError( "integer %i out of range of BAM/SAM specification" % value )
-                        else: fmt, ctype = "<ccci", 'i'[0]
-                    else:
-                        if value <= 255: fmt, pytype = "<cccB", 'C'
-                        elif value <= 65535: fmt, pytype = "<cccH", 'S'
-                        elif value > 4294967295: raise ValueError( "integer %i out of range of BAM/SAM specification" % value )
-                        else: fmt, pytype = "<cccI", 'I'
-                else:
-                    # Note: hex strings (H) are not supported yet
-                    if len(value) == 1:
-                        fmt, pytype = "<cccc", 'A'
+
+            if tags != None: 
+
+                # map samtools code to python.struct code and byte size
+                buffer = ctypes.create_string_buffer(max_size)
+
+                for pytag, value in tags:
+                    t = type(value)
+                    if t == types.FloatType:
+                        fmt = "<cccf"
+                    elif t == types.IntType:
+                        if value < 0:
+                            if value >= -127: fmt, pytype = "<cccb", 'c'
+                            elif value >= -32767: fmt, pytype = "<ccch", 's'
+                            elif value < -2147483648: raise ValueError( "integer %i out of range of BAM/SAM specification" % value )
+                            else: fmt, ctype = "<ccci", 'i'[0]
+                        else:
+                            if value <= 255: fmt, pytype = "<cccB", 'C'
+                            elif value <= 65535: fmt, pytype = "<cccH", 'S'
+                            elif value > 4294967295: raise ValueError( "integer %i out of range of BAM/SAM specification" % value )
+                            else: fmt, pytype = "<cccI", 'I'
                      else:
-                        fmt, pytype = "<ccc%is" % (len(value)+1), 'Z'
-
-                size = struct.calcsize(fmt)
-                if offset + size > max_size:
-                    raise NotImplementedError("tags field too large")
-
-                struct.pack_into( fmt,
-                                  buffer,
-                                  offset,
-                                  pytag[0],
-                                  pytag[1],
-                                  pytype,
-                                  value )
-                offset += size
+                        # Note: hex strings (H) are not supported yet
+                        if len(value) == 1:
+                            fmt, pytype = "<cccc", 'A'
+                        else:
+                            fmt, pytype = "<ccc%is" % (len(value)+1), 'Z'
+
+                    size = struct.calcsize(fmt)
+                    if offset + size > max_size:
+                        raise NotImplementedError("tags field too large")
+
+                    struct.pack_into( fmt,
+                                      buffer,
+                                      offset,
+                                      pytag[0],
+                                      pytag[1],
+                                      pytype,
+                                      value )
+                    offset += size
              
              # delete the old data and allocate new
+            # if offset == 0, the aux field will be 
+            # empty
              pysam_bam_update( src, 
                                src.l_aux,
                                offset,
@@ -1609,15 +1641,15 @@ cdef class AlignedRead:
              
              src.l_aux = offset
  
-            if offset == 0: return
+            # copy data only if there is any
+            if offset != 0:
  
-            # get location of new data
-            s = bam1_aux( src )            
+                # get location of new data
+                s = bam1_aux( src )            
              
-            # check if there is direct path from buffer.raw to tmp
-            cdef char * temp 
-            temp = buffer.raw
-            memcpy( s, temp, offset )            
+                # check if there is direct path from buffer.raw to tmp
+                temp = buffer.raw
+                memcpy( s, temp, offset )            
  
      property flag: 
          """properties flag"""