first pass cleanup of cistematic/genomes; change bamPreprocessing

[erange.git] / commoncode.py
diff --git a/commoncode.py b/commoncode.py

index 821936a30bc2524f0aa1664de1a66ecb8ae47fd6..0e4fdbbb7229b5f0c7689218dd9f1cbf2ba40b55 100755 (executable)
--- a/commoncode.py
+++ b/commoncode.py
@@ -1,8 +1,3 @@
-#
-#  commoncode.py
-#  ENRAGE
-#
-
  import ConfigParser
  import os
  import string
@@ -15,7 +10,10 @@ from cistematic.genomes import Genome
  import Region
  
  commoncodeVersion = 5.6
-currentRDSversion = 2.0
+
+#TODO: This is a function dumping ground - needs to be reworked
+class ErangeError(Exception):
+    pass
  
  
  def getReverseComplement(base):
@@ -51,6 +49,78 @@ def writeLog(logFile, messenger, message):
      logfile.close()
  
  
+def getChromInfoList(chrominfo):
+    chromInfoList = []
+    linelist = open(chrominfo)
+    for line in linelist:
+        fields = line.strip().split('\t')
+        chr = fields[0]
+        start = 0
+        end = int(fields[1])
+        chromInfoList.append((chr,start,end))
+
+    linelist.close()
+
+    return chromInfoList
+
+
+# BAM file support functions
+def isSpliceEntry(cigarTupleList):
+    isSplice = False
+    if cigarTupleList is not None:
+        for operation,length in cigarTupleList:
+            if operation == 3:
+                isSplice = True
+                break
+
+    return isSplice
+
+
+def addPairIDtoReadID(alignedread):
+    ID = alignedread.qname
+    if alignedread.is_read1:
+        ID = ID + '/1'
+
+    if alignedread.is_read2:
+        ID = ID + '/2'
+
+    return ID
+
+
+def getHeaderComment(bamHeader, commentKey):
+    for comment in bamHeader["CO"]:
+        fields = comment.split("\t")
+        if fields[0] == commentKey:
+            return fields[1]
+
+    raise KeyError
+
+
+def getReadSense(read):
+    if read.is_reverse:
+        sense = "-"
+    else:
+        sense = "+"
+
+    return sense
+
+#TODO: is this where we need to take into account the NH > 10 restriction?
+def countThisRead(read, countUniqs, countMulti, countSplices, sense):
+    countRead = True
+
+    if sense in ['-', '+'] and sense != getReadSense(read):
+        countRead = False
+    elif not countSplices and isSpliceEntry(read.cigar):
+        countRead = False
+    elif not countUniqs and read.opt('NH') == 1:
+        countRead = False
+    elif not countMulti and read.opt('NH') > 1:
+        countRead = False
+
+    return countRead
+
+
+# Cistematic functions
  def getGeneInfoDict(genome, cache=False):
      idb = geneinfoDB(cache=cache)
      if genome == "dmelanogaster":
@@ -65,16 +135,17 @@ def getGeneAnnotDict(genome, inRAM=False):
      return getExtendedGeneAnnotDict(genome, "", inRAM=inRAM)
  
  
-def getExtendedGeneAnnotDict(genome, extendGenome, replaceModels=False, inRAM=False):
-    hg = Genome(genome, inRAM=inRAM)
+def getExtendedGeneAnnotDict(genomeName, extendGenome, replaceModels=False, inRAM=False):
+    genome = Genome(genomeName, inRAM=inRAM)
      if extendGenome != "":
-        hg.extendFeatures(extendGenome, replace=replaceModels)
+        genome.extendFeatures(extendGenome, replace=replaceModels)
  
-    geneannotDict = hg.allAnnotInfo()
+    geneannotDict = genome.allAnnotInfo()
  
      return geneannotDict
  
  
+# Configuration File Support
  def getConfigParser(fileList=[]):
      configFiles = ["erange.config", os.path.expanduser("~/.erange.config")]
      for filename in fileList:
@@ -131,6 +202,7 @@ def getAllConfigSectionOptions(parser, section):
      return setting
  
  
+# All the Other Stuff from before
  def getMergedRegions(regionfilename, maxDist=1000, minHits=0, verbose=False, keepLabel=False,
                       fullChrom=False, chromField=1, scoreField=4, pad=0, compact=False,
                       doMerge=True, keepPeak=False, returnTop=0):
@@ -194,6 +266,7 @@ def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False,
      #      the case of 3 regions ABC that are in the input file as ACB as it goes now when processing C there
      #      will be no merge with A as B is needed to bridge the two.  When it comes time to process B it will
      #      be merged with A but that will exit the loop and the merge with C will be missed.
+    #TODO: Are we going to require sorted region files to have this even work?
      for regionEntry in regionList:
          if regionEntry[0] == "#":
              if "pvalue" in regionEntry:
@@ -336,7 +409,7 @@ def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False,
      """
  
      if shift == "auto":
-        shift = getBestShiftForRegion(hitList, start, length, doWeight, maxshift)
+        shift = getBestShiftForRegion(hitList, start, length, useWeight=doWeight, maxShift=maxshift)
  
      seqArray, regionArray, numHits, numPlus = findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, leftPlus)
  
@@ -346,7 +419,7 @@ def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False,
          smoothArray[pos] = (seqArray[pos -2] + 2 * seqArray[pos - 1] + 3 * seqArray[pos] + 2 * seqArray[pos + 1] + seqArray[pos + 2]) / 9.0
  
      topPos = getPeakPositionList(smoothArray, length)
-    peak = Peak(topPos, numHits, smoothArray, numPlus, shift=shift)
+    peak = Peak.Peak(topPos, numHits, smoothArray, numPlus, shift=shift)
  
      if leftPlus:
          numLeftPlus = 0
@@ -366,36 +439,33 @@ def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False,
      return peak
  
  
-def getBestShiftForRegion(hitList, start, length, doWeight=False, maxShift=75):
+def getBestShiftForRegion(readList, start, length, useWeight=False, maxShift=75):
      bestShift = 0
      lowestScore = 20000000000
      for testShift in xrange(maxShift + 1):
          shiftArray = array("f", [0.] * length)
-        for read in hitList:
+        for read in readList:
              currentpos = read["start"] - start
              if read["sense"] == "+":
-                currentpos += testShift
+                senseModifier = 1
              else:
-                currentpos -= testShift
+                senseModifier = -1
  
+            currentpos += testShift * senseModifier
              if (currentpos < 1) or (currentpos >= length):
                  continue
  
-            if doWeight:
+            if useWeight:
                  weight = read["weight"]
              else:
                  weight = 1.0
  
-            if read["sense"] == "+":
-                shiftArray[currentpos] += weight
-            else:
-                shiftArray[currentpos] -= weight
+            shiftArray[currentpos] += weight * senseModifier
  
          currentScore = 0
          for score in shiftArray:
              currentScore += abs(score)
  
-        print currentScore
          if currentScore < lowestScore:
              bestShift = testShift
              lowestScore = currentScore
@@ -432,7 +502,7 @@ def findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, left
              hitIndex += 1
              currentpos += 1
  
-        while hitIndex < readlen and  currentpos < length:
+        while hitIndex < readlen and currentpos < length:
              seqArray[currentpos] += weight
              hitIndex += 1
              currentpos += 1
@@ -450,7 +520,7 @@ def getPeakPositionList(smoothArray, length):
          if topNucleotide < smoothArray[currentpos]:
              topNucleotide = smoothArray[currentpos]
              peakList = [currentpos]
-        elif topNucleotide  == smoothArray[currentpos]:
+        elif topNucleotide == smoothArray[currentpos]:
              peakList.append(currentpos)
  
      return peakList
@@ -496,40 +566,8 @@ def getFeaturesByChromDict(genomeObject, additionalRegionsDict={}, ignorePseudo=
          if restrictGID and gid not in restrictList:
              continue
  
-        featureList = featuresDict[gid]
-        newFeatureList = []
-        isPseudo = False
-        for (ftype, chrom, start, stop, sense) in featureList:
-            if ftype == "PSEUDO":
-                isPseudo = True
-
-            if (start, stop, ftype) not in newFeatureList:
-                notContained = True
-                containedList = []
-                for (fstart, fstop, ftype2) in newFeatureList:
-                    if start >= fstart and stop <= fstop:
-                        notContained = False
-
-                    if start < fstart and stop > fstop:
-                        containedList.append((fstart, fstop))
-
-                if len(containedList) > 0:
-                    newFList = []
-                    notContained = True
-                    for (fstart, fstop, ftype2) in newFeatureList:
-                        if (fstart, fstop) not in containedList:
-                            newFList.append((fstart, fstop, ftype2))
-                            if start >= fstart and stop <= fstop:
-                                notContained = False
-
-                    newFeatureList = newFList
-                if notContained:
-                    newFeatureList.append((start, stop, ftype))
-
-        if ignorePseudo and isPseudo:
-            continue
-
-        if chrom not in featuresByChromDict:
+        newFeatureList, chrom, sense = getNewFeatureList(featuresDict[gid])
+        if newFeatureList and chrom not in featuresByChromDict:
              featuresByChromDict[chrom] = []
  
          for (start, stop, ftype) in newFeatureList:
@@ -562,7 +600,40 @@ def getFeaturesByChromDict(genomeObject, additionalRegionsDict={}, ignorePseudo=
          return featuresByChromDict
  
  
-def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True,
+def getNewFeatureList(featureList, ignorePseudo=False):
+    newFeatureList = []
+    for (featureType, chrom, start, stop, sense) in featureList:
+        if featureType == "PSEUDO" and ignorePseudo:
+            return [], chrom, sense
+
+        if (start, stop, featureType) not in newFeatureList:
+            notContained = True
+            containedList = []
+            for (newFeatureStart, newFeatureStop, featureType2) in newFeatureList:
+                if start >= newFeatureStart and stop <= newFeatureStop:
+                    notContained = False
+
+                if start < newFeatureStart and stop > newFeatureStop:
+                    containedList.append((newFeatureStart, newFeatureStop))
+
+            if len(containedList) > 0:
+                newFList = []
+                notContained = True
+                for (newFeatureStart, newFeatureStop, featureType2) in newFeatureList:
+                    if (newFeatureStart, newFeatureStop) not in containedList:
+                        newFList.append((newFeatureStart, newFeatureStop, featureType2))
+                        if start >= newFeatureStart and stop <= newFeatureStop:
+                            notContained = False
+
+                newFeatureList = newFList
+
+            if notContained:
+                newFeatureList.append((start, stop, featureType))
+
+    return newFeatureList, chrom, sense
+
+
+def getLocusByChromDict(genome, upstream=0, downstream=0, useCDS=True,
                          additionalRegionsDict={}, ignorePseudo=False, upstreamSpanTSS=False,
                          lengthCDS=0, keepSense=False, adjustToNeighbor=True):
      """ return a dictionary of gene loci. Can be used to retrieve additional
@@ -575,46 +646,11 @@ def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True,
      lengthCDS < 0bp, return only the last X bp from CDS.
      """ 
      locusByChromDict = {}
-    if upstream == 0 and downstream == 0 and not useCDS:
-        print "getLocusByChromDict: asked for no sequence - returning empty dict"
-        return locusByChromDict
-    elif upstream > 0 and downstream > 0 and not useCDS:
-        print "getLocusByChromDict: asked for only upstream and downstream - returning empty dict"
-        return locusByChromDict
-    elif lengthCDS != 0 and not useCDS:
-        print "getLocusByChromDict: asked for partial CDS but not useCDS - returning empty dict"
-        return locusByChromDict
-    elif upstreamSpanTSS and lengthCDS != 0:
-        print "getLocusByChromDict: asked for TSS spanning and partial CDS - returning empty dict"
-        return locusByChromDict
-    elif lengthCDS > 0 and downstream > 0:
-        print "getLocusByChromDict: asked for discontinuous partial CDS from start and downstream - returning empty dict"
-        return locusByChromDict
-    elif lengthCDS < 0 and upstream > 0:
-        print "getLocusByChromDict: asked for discontinuous partial CDS from stop and upstream - returning empty dict"
+    if usingBadLocusParameters(upstream, downstream, useCDS, lengthCDS, upstreamSpanTSS):
          return locusByChromDict
  
-    genome = genomeObject.genome
-    featuresDict = genomeObject.getallGeneFeatures()
-    if len(additionalRegionsDict) > 0:
-        sortList = []
-        for chrom in additionalRegionsDict:
-            for region in additionalRegionsDict[chrom]:
-                label = region.label
-                if label not in sortList:
-                    sortList.append(label)
-
-                if label not in featuresDict:
-                    featuresDict[label] = []
-                    sense = "+"
-                else:
-                    sense = featuresDict[label][0][-1]
-
-                featuresDict[label].append(("custom", chrom, region.start, region.stop, sense))
-
-        for gid in sortList:
-            featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
-
+    genomeName = genome.genome
+    featuresDict = getGeneFeatures(genome, additionalRegionsDict)
      for gid in featuresDict:
          featureList = featuresDict[gid]
          newFeatureList = []
@@ -647,34 +683,28 @@ def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True,
                      distance = upstream
  
                  if adjustToNeighbor:
-                    nextGene = genomeObject.leftGeneDistance((genome, gid), distance * 2)
+                    nextGene = genome.leftGeneDistance((genomeName, gid), distance * 2)
                      if nextGene < distance * 2:
                          distance = nextGene / 2
  
-                if distance < 1:
-                    distance = 1
-
+                distance = max(distance, 1)
                  gstart -= distance
  
              if downstream > 0:
                  distance = downstream
                  if adjustToNeighbor:
-                    nextGene = genomeObject.rightGeneDistance((genome, gid), downstream * 2)
+                    nextGene = genome.rightGeneDistance((genomeName, gid), downstream * 2)
                      if nextGene < downstream * 2:
                          distance = nextGene / 2
  
-                if distance < 1:
-                    distance = 1
-
+                distance = max(distance, 1)
                  gstop += distance
  
-            if lengthCDS > 0:
-                if lengthCDS < glen:
-                    gstop = newFeatureList[0][0] + lengthCDS
+            if 0 < lengthCDS < glen:
+                gstop = newFeatureList[0][0] + lengthCDS
  
-            if lengthCDS < 0:
-                if abs(lengthCDS) < glen:
-                    gstart = newFeatureList[-1][1] + lengthCDS
+            if lengthCDS < 0 and abs(lengthCDS) < glen:
+                gstart = newFeatureList[-1][1] + lengthCDS
          else:
              if not useCDS and upstream > 0:
                  if upstreamSpanTSS:
@@ -692,36 +722,29 @@ def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True,
                      distance = upstream
  
                  if adjustToNeighbor:
-                    nextGene = genomeObject.rightGeneDistance((genome, gid), distance * 2)
+                    nextGene = genome.rightGeneDistance((genomeName, gid), distance * 2)
                      if nextGene < distance * 2:
                          distance = nextGene / 2
  
-                if distance < 1:
-                    distance = 1
-
+                distance = max(distance, 1)
                  gstop += distance
  
              if downstream > 0:
                  distance = downstream
                  if adjustToNeighbor:
-                    nextGene = genomeObject.leftGeneDistance((genome, gid), downstream * 2)
+                    nextGene = genome.leftGeneDistance((genomeName, gid), downstream * 2)
                      if nextGene < downstream * 2:
                          distance = nextGene / 2
  
-                if distance < 1:
-                    distance = 1
-
+                distance = max(distance, 1)
                  gstart -= distance
  
-            if lengthCDS > 0:
-                if lengthCDS < glen:
-                    gstart = newFeatureList[-1][-1] - lengthCDS
+            if 0 < lengthCDS < glen:
+                gstart = newFeatureList[-1][-1] - lengthCDS
  
-            if lengthCDS < 0:
-                if abs(lengthCDS) < glen:
-                    gstop = newFeatureList[0][0] - lengthCDS
+            if lengthCDS < 0 and abs(lengthCDS) < glen:
+                gstop = newFeatureList[0][0] - lengthCDS
  
-        glen = abs(gstop - gstart)
          if chrom not in locusByChromDict:
              locusByChromDict[chrom] = []
  
@@ -736,6 +759,53 @@ def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True,
      return locusByChromDict
  
  
+def usingBadLocusParameters(upstream, downstream, useCDS, lengthCDS, upstreamSpanTSS):
+    if upstream == 0 and downstream == 0 and not useCDS:
+        print "getLocusByChromDict: asked for no sequence - returning empty dict"
+        return True
+    elif upstream > 0 and downstream > 0 and not useCDS:
+        print "getLocusByChromDict: asked for only upstream and downstream - returning empty dict"
+        return True
+    elif lengthCDS != 0 and not useCDS:
+        print "getLocusByChromDict: asked for partial CDS but not useCDS - returning empty dict"
+        return True
+    elif upstreamSpanTSS and lengthCDS != 0:
+        print "getLocusByChromDict: asked for TSS spanning and partial CDS - returning empty dict"
+        return True
+    elif lengthCDS > 0 and downstream > 0:
+        print "getLocusByChromDict: asked for discontinuous partial CDS from start and downstream - returning empty dict"
+        return True
+    elif lengthCDS < 0 and upstream > 0:
+        print "getLocusByChromDict: asked for discontinuous partial CDS from stop and upstream - returning empty dict"
+        return True
+
+    return False
+
+
+def getGeneFeatures(genome, additionalRegionsDict):
+    featuresDict = genome.getallGeneFeatures()
+    if len(additionalRegionsDict) > 0:
+        sortList = []
+        for chrom in additionalRegionsDict:
+            for region in additionalRegionsDict[chrom]:
+                label = region.label
+                if label not in sortList:
+                    sortList.append(label)
+
+                if label not in featuresDict:
+                    featuresDict[label] = []
+                    sense = "+"
+                else:
+                    sense = featuresDict[label][0][-1]
+
+                featuresDict[label].append(("custom", chrom, region.start, region.stop, sense))
+
+        for gid in sortList:
+            featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
+
+    return featuresDict
+
+
  def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[],
                        normalizedTag=1., defaultRegionFormat=True, fixedFirstBin=-1,
                        binLength=-1):
@@ -798,7 +868,7 @@ def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[],
                  rlen = regionTuple[lengthField]
                  try:
                      rsense = regionTuple[senseField]
-                except:
+                except IndexError:
                      rsense = "F"
  
                  if tagStart > stop:
@@ -815,37 +885,25 @@ def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[],
                      else:
                          regionBinLength = binLength
  
-                    startdist = tagStart - start
                      if rsense == "F":
-                        # we are relying on python's integer division quirk
-                        binID = startdist / regionBinLength
-                        if (fixedFirstBin > 0) and (startdist < fixedFirstBin):
-                            binID = 0
-                        elif fixedFirstBin > 0:
-                            binID = 1
-
-                        if binID >= bins:
-                            binID = bins - 1
-
-                        try:
-                            regionsBins[regionID][binID] += normalizedTag * weight
-                        except KeyError:
-                            print "%s %s" % (regionID, str(binID))
+                        positionInRegion = tagStart - start
                      else:
-                        rdist = rlen - startdist
-                        binID = rdist / regionBinLength
-                        if (fixedFirstBin > 0) and (rdist < fixedFirstBin):
-                            binID = 0
-                        elif fixedFirstBin > 0:
-                            binID = 1
-
-                        if binID >= bins:
-                            binID = bins - 1
-
-                        try:
-                            regionsBins[regionID][binID] += normalizedTag * weight
-                        except KeyError:
-                            print "%s %s" % (regionID, str(binID))
+                        positionInRegion = start + rlen - tagStart
+
+                    # we are relying on python's integer division quirk
+                    binID = positionInRegion / regionBinLength
+                    if (fixedFirstBin > 0) and (positionInRegion < fixedFirstBin):
+                        binID = 0
+                    elif fixedFirstBin > 0:
+                        binID = 1
+
+                    if binID >= bins:
+                        binID = bins - 1
+
+                    try:
+                        regionsBins[regionID][binID] += normalizedTag * weight
+                    except KeyError:
+                        print "%s %s" % (regionID, str(binID))
  
                      stopPoint = stop