rewrite of findall.py and MakeRdsFromBam to fix bugs resulting from poor initial...

[erange.git] / commoncode.py
diff --git a/commoncode.py b/commoncode.py

index f526daacca0dc91245e71205363216bc17eba2e9..6ef4edfcb3e10d75ae0eecea887f45581f9e4edb 100755 (executable)
--- a/commoncode.py
+++ b/commoncode.py
@@ -1,8 +1,3 @@
-#
-#  commoncode.py
-#  ENRAGE
-#
-
  import ConfigParser
  import os
  import string
@@ -192,6 +187,11 @@ def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False,
      mergeCount = 0
      chromField = int(chromField)
      count = 0
+    #TODO: Current algorithm processes input file line by line and compares with prior lines.  Problem is it
+    #      exits at the first merge.  This is not a problem when the input is sorted by start position, but in
+    #      the case of 3 regions ABC that are in the input file as ACB as it goes now when processing C there
+    #      will be no merge with A as B is needed to bridge the two.  When it comes time to process B it will
+    #      be merged with A but that will exit the loop and the merge with C will be missed.
      for regionEntry in regionList:
          if regionEntry[0] == "#":
              if "pvalue" in regionEntry:
@@ -372,10 +372,11 @@ def getBestShiftForRegion(readList, start, length, useWeight=False, maxShift=75)
          for read in readList:
              currentpos = read["start"] - start
              if read["sense"] == "+":
-                currentpos += testShift
+                senseModifier = 1
              else:
-                currentpos -= testShift
+                senseModifier = -1
  
+            currentpos += testShift * senseModifier
              if (currentpos < 1) or (currentpos >= length):
                  continue
  
@@ -384,16 +385,12 @@ def getBestShiftForRegion(readList, start, length, useWeight=False, maxShift=75)
              else:
                  weight = 1.0
  
-            if read["sense"] == "+":
-                shiftArray[currentpos] += weight
-            else:
-                shiftArray[currentpos] -= weight
+            shiftArray[currentpos] += weight * senseModifier
  
          currentScore = 0
          for score in shiftArray:
              currentScore += abs(score)
  
-        print currentScore
          if currentScore < lowestScore:
              bestShift = testShift
              lowestScore = currentScore
@@ -494,40 +491,8 @@ def getFeaturesByChromDict(genomeObject, additionalRegionsDict={}, ignorePseudo=
          if restrictGID and gid not in restrictList:
              continue
  
-        featureList = featuresDict[gid]
-        newFeatureList = []
-        isPseudo = False
-        for (ftype, chrom, start, stop, sense) in featureList:
-            if ftype == "PSEUDO":
-                isPseudo = True
-
-            if (start, stop, ftype) not in newFeatureList:
-                notContained = True
-                containedList = []
-                for (fstart, fstop, ftype2) in newFeatureList:
-                    if start >= fstart and stop <= fstop:
-                        notContained = False
-
-                    if start < fstart and stop > fstop:
-                        containedList.append((fstart, fstop))
-
-                if len(containedList) > 0:
-                    newFList = []
-                    notContained = True
-                    for (fstart, fstop, ftype2) in newFeatureList:
-                        if (fstart, fstop) not in containedList:
-                            newFList.append((fstart, fstop, ftype2))
-                            if start >= fstart and stop <= fstop:
-                                notContained = False
-
-                    newFeatureList = newFList
-                if notContained:
-                    newFeatureList.append((start, stop, ftype))
-
-        if ignorePseudo and isPseudo:
-            continue
-
-        if chrom not in featuresByChromDict:
+        newFeatureList, chrom, sense = getNewFeatureList(featuresDict[gid])
+        if newFeatureList and chrom not in featuresByChromDict:
              featuresByChromDict[chrom] = []
  
          for (start, stop, ftype) in newFeatureList:
@@ -560,6 +525,39 @@ def getFeaturesByChromDict(genomeObject, additionalRegionsDict={}, ignorePseudo=
          return featuresByChromDict
  
  
+def getNewFeatureList(featureList, ignorePseudo=False):
+    newFeatureList = []
+    for (featureType, chrom, start, stop, sense) in featureList:
+        if featureType == "PSEUDO" and ignorePseudo:
+            return [], chrom, sense
+
+        if (start, stop, featureType) not in newFeatureList:
+            notContained = True
+            containedList = []
+            for (newFeatureStart, newFeatureStop, featureType2) in newFeatureList:
+                if start >= newFeatureStart and stop <= newFeatureStop:
+                    notContained = False
+
+                if start < newFeatureStart and stop > newFeatureStop:
+                    containedList.append((newFeatureStart, newFeatureStop))
+
+            if len(containedList) > 0:
+                newFList = []
+                notContained = True
+                for (newFeatureStart, newFeatureStop, featureType2) in newFeatureList:
+                    if (newFeatureStart, newFeatureStop) not in containedList:
+                        newFList.append((newFeatureStart, newFeatureStop, featureType2))
+                        if start >= newFeatureStart and stop <= newFeatureStop:
+                            notContained = False
+
+                newFeatureList = newFList
+
+            if notContained:
+                newFeatureList.append((start, stop, featureType))
+
+    return newFeatureList, chrom, sense
+
+
  def getLocusByChromDict(genome, upstream=0, downstream=0, useCDS=True,
                          additionalRegionsDict={}, ignorePseudo=False, upstreamSpanTSS=False,
                          lengthCDS=0, keepSense=False, adjustToNeighbor=True):
@@ -573,23 +571,7 @@ def getLocusByChromDict(genome, upstream=0, downstream=0, useCDS=True,
      lengthCDS < 0bp, return only the last X bp from CDS.
      """ 
      locusByChromDict = {}
-    if upstream == 0 and downstream == 0 and not useCDS:
-        print "getLocusByChromDict: asked for no sequence - returning empty dict"
-        return locusByChromDict
-    elif upstream > 0 and downstream > 0 and not useCDS:
-        print "getLocusByChromDict: asked for only upstream and downstream - returning empty dict"
-        return locusByChromDict
-    elif lengthCDS != 0 and not useCDS:
-        print "getLocusByChromDict: asked for partial CDS but not useCDS - returning empty dict"
-        return locusByChromDict
-    elif upstreamSpanTSS and lengthCDS != 0:
-        print "getLocusByChromDict: asked for TSS spanning and partial CDS - returning empty dict"
-        return locusByChromDict
-    elif lengthCDS > 0 and downstream > 0:
-        print "getLocusByChromDict: asked for discontinuous partial CDS from start and downstream - returning empty dict"
-        return locusByChromDict
-    elif lengthCDS < 0 and upstream > 0:
-        print "getLocusByChromDict: asked for discontinuous partial CDS from stop and upstream - returning empty dict"
+    if usingBadLocusParameters(upstream, downstream, useCDS, lengthCDS, upstreamSpanTSS):
          return locusByChromDict
  
      genomeName = genome.genome
@@ -702,6 +684,29 @@ def getLocusByChromDict(genome, upstream=0, downstream=0, useCDS=True,
      return locusByChromDict
  
  
+def usingBadLocusParameters(upstream, downstream, useCDS, lengthCDS, upstreamSpanTSS):
+    if upstream == 0 and downstream == 0 and not useCDS:
+        print "getLocusByChromDict: asked for no sequence - returning empty dict"
+        return True
+    elif upstream > 0 and downstream > 0 and not useCDS:
+        print "getLocusByChromDict: asked for only upstream and downstream - returning empty dict"
+        return True
+    elif lengthCDS != 0 and not useCDS:
+        print "getLocusByChromDict: asked for partial CDS but not useCDS - returning empty dict"
+        return True
+    elif upstreamSpanTSS and lengthCDS != 0:
+        print "getLocusByChromDict: asked for TSS spanning and partial CDS - returning empty dict"
+        return True
+    elif lengthCDS > 0 and downstream > 0:
+        print "getLocusByChromDict: asked for discontinuous partial CDS from start and downstream - returning empty dict"
+        return True
+    elif lengthCDS < 0 and upstream > 0:
+        print "getLocusByChromDict: asked for discontinuous partial CDS from stop and upstream - returning empty dict"
+        return True
+
+    return False
+
+
  def getGeneFeatures(genome, additionalRegionsDict):
      featuresDict = genome.getallGeneFeatures()
      if len(additionalRegionsDict) > 0:
@@ -805,37 +810,25 @@ def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[],
                      else:
                          regionBinLength = binLength
  
-                    startdist = tagStart - start
                      if rsense == "F":
-                        # we are relying on python's integer division quirk
-                        binID = startdist / regionBinLength
-                        if (fixedFirstBin > 0) and (startdist < fixedFirstBin):
-                            binID = 0
-                        elif fixedFirstBin > 0:
-                            binID = 1
-
-                        if binID >= bins:
-                            binID = bins - 1
-
-                        try:
-                            regionsBins[regionID][binID] += normalizedTag * weight
-                        except KeyError:
-                            print "%s %s" % (regionID, str(binID))
+                        positionInRegion = tagStart - start
                      else:
-                        rdist = rlen - startdist
-                        binID = rdist / regionBinLength
-                        if (fixedFirstBin > 0) and (rdist < fixedFirstBin):
-                            binID = 0
-                        elif fixedFirstBin > 0:
-                            binID = 1
-
-                        if binID >= bins:
-                            binID = bins - 1
-
-                        try:
-                            regionsBins[regionID][binID] += normalizedTag * weight
-                        except KeyError:
-                            print "%s %s" % (regionID, str(binID))
+                        positionInRegion = start + rlen - tagStart
+
+                    # we are relying on python's integer division quirk
+                    binID = positionInRegion / regionBinLength
+                    if (fixedFirstBin > 0) and (positionInRegion < fixedFirstBin):
+                        binID = 0
+                    elif fixedFirstBin > 0:
+                        binID = 1
+
+                    if binID >= bins:
+                        binID = bins - 1
+
+                    try:
+                        regionsBins[regionID][binID] += normalizedTag * weight
+                    except KeyError:
+                        print "%s %s" % (regionID, str(binID))
  
                      stopPoint = stop