Release version for Erange 4.0a
[erange.git] / commoncode.py
index 821936a30bc2524f0aa1664de1a66ecb8ae47fd6..f526daacca0dc91245e71205363216bc17eba2e9 100755 (executable)
@@ -17,6 +17,9 @@ import Region
 commoncodeVersion = 5.6
 currentRDSversion = 2.0
 
+class ErangeError(Exception):
+    pass
+
 
 def getReverseComplement(base):
     revComp = {"A": "T",
@@ -65,12 +68,12 @@ def getGeneAnnotDict(genome, inRAM=False):
     return getExtendedGeneAnnotDict(genome, "", inRAM=inRAM)
 
 
-def getExtendedGeneAnnotDict(genome, extendGenome, replaceModels=False, inRAM=False):
-    hg = Genome(genome, inRAM=inRAM)
+def getExtendedGeneAnnotDict(genomeName, extendGenome, replaceModels=False, inRAM=False):
+    genome = Genome(genomeName, inRAM=inRAM)
     if extendGenome != "":
-        hg.extendFeatures(extendGenome, replace=replaceModels)
+        genome.extendFeatures(extendGenome, replace=replaceModels)
 
-    geneannotDict = hg.allAnnotInfo()
+    geneannotDict = genome.allAnnotInfo()
 
     return geneannotDict
 
@@ -189,11 +192,6 @@ def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False,
     mergeCount = 0
     chromField = int(chromField)
     count = 0
-    #TODO: Current algorithm processes input file line by line and compares with prior lines.  Problem is it
-    #      exits at the first merge.  This is not a problem when the input is sorted by start position, but in
-    #      the case of 3 regions ABC that are in the input file as ACB as it goes now when processing C there
-    #      will be no merge with A as B is needed to bridge the two.  When it comes time to process B it will
-    #      be merged with A but that will exit the loop and the merge with C will be missed.
     for regionEntry in regionList:
         if regionEntry[0] == "#":
             if "pvalue" in regionEntry:
@@ -336,7 +334,7 @@ def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False,
     """
 
     if shift == "auto":
-        shift = getBestShiftForRegion(hitList, start, length, doWeight, maxshift)
+        shift = getBestShiftForRegion(hitList, start, length, useWeight=doWeight, maxShift=maxshift)
 
     seqArray, regionArray, numHits, numPlus = findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, leftPlus)
 
@@ -346,7 +344,7 @@ def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False,
         smoothArray[pos] = (seqArray[pos -2] + 2 * seqArray[pos - 1] + 3 * seqArray[pos] + 2 * seqArray[pos + 1] + seqArray[pos + 2]) / 9.0
 
     topPos = getPeakPositionList(smoothArray, length)
-    peak = Peak(topPos, numHits, smoothArray, numPlus, shift=shift)
+    peak = Peak.Peak(topPos, numHits, smoothArray, numPlus, shift=shift)
 
     if leftPlus:
         numLeftPlus = 0
@@ -366,12 +364,12 @@ def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False,
     return peak
 
 
-def getBestShiftForRegion(hitList, start, length, doWeight=False, maxShift=75):
+def getBestShiftForRegion(readList, start, length, useWeight=False, maxShift=75):
     bestShift = 0
     lowestScore = 20000000000
     for testShift in xrange(maxShift + 1):
         shiftArray = array("f", [0.] * length)
-        for read in hitList:
+        for read in readList:
             currentpos = read["start"] - start
             if read["sense"] == "+":
                 currentpos += testShift
@@ -381,7 +379,7 @@ def getBestShiftForRegion(hitList, start, length, doWeight=False, maxShift=75):
             if (currentpos < 1) or (currentpos >= length):
                 continue
 
-            if doWeight:
+            if useWeight:
                 weight = read["weight"]
             else:
                 weight = 1.0
@@ -432,7 +430,7 @@ def findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, left
             hitIndex += 1
             currentpos += 1
 
-        while hitIndex < readlen and  currentpos < length:
+        while hitIndex < readlen and currentpos < length:
             seqArray[currentpos] += weight
             hitIndex += 1
             currentpos += 1
@@ -450,7 +448,7 @@ def getPeakPositionList(smoothArray, length):
         if topNucleotide < smoothArray[currentpos]:
             topNucleotide = smoothArray[currentpos]
             peakList = [currentpos]
-        elif topNucleotide  == smoothArray[currentpos]:
+        elif topNucleotide == smoothArray[currentpos]:
             peakList.append(currentpos)
 
     return peakList
@@ -562,7 +560,7 @@ def getFeaturesByChromDict(genomeObject, additionalRegionsDict={}, ignorePseudo=
         return featuresByChromDict
 
 
-def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True,
+def getLocusByChromDict(genome, upstream=0, downstream=0, useCDS=True,
                         additionalRegionsDict={}, ignorePseudo=False, upstreamSpanTSS=False,
                         lengthCDS=0, keepSense=False, adjustToNeighbor=True):
     """ return a dictionary of gene loci. Can be used to retrieve additional
@@ -594,27 +592,8 @@ def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True,
         print "getLocusByChromDict: asked for discontinuous partial CDS from stop and upstream - returning empty dict"
         return locusByChromDict
 
-    genome = genomeObject.genome
-    featuresDict = genomeObject.getallGeneFeatures()
-    if len(additionalRegionsDict) > 0:
-        sortList = []
-        for chrom in additionalRegionsDict:
-            for region in additionalRegionsDict[chrom]:
-                label = region.label
-                if label not in sortList:
-                    sortList.append(label)
-
-                if label not in featuresDict:
-                    featuresDict[label] = []
-                    sense = "+"
-                else:
-                    sense = featuresDict[label][0][-1]
-
-                featuresDict[label].append(("custom", chrom, region.start, region.stop, sense))
-
-        for gid in sortList:
-            featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
-
+    genomeName = genome.genome
+    featuresDict = getGeneFeatures(genome, additionalRegionsDict)
     for gid in featuresDict:
         featureList = featuresDict[gid]
         newFeatureList = []
@@ -647,34 +626,28 @@ def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True,
                     distance = upstream
 
                 if adjustToNeighbor:
-                    nextGene = genomeObject.leftGeneDistance((genome, gid), distance * 2)
+                    nextGene = genome.leftGeneDistance((genomeName, gid), distance * 2)
                     if nextGene < distance * 2:
                         distance = nextGene / 2
 
-                if distance < 1:
-                    distance = 1
-
+                distance = max(distance, 1)
                 gstart -= distance
 
             if downstream > 0:
                 distance = downstream
                 if adjustToNeighbor:
-                    nextGene = genomeObject.rightGeneDistance((genome, gid), downstream * 2)
+                    nextGene = genome.rightGeneDistance((genomeName, gid), downstream * 2)
                     if nextGene < downstream * 2:
                         distance = nextGene / 2
 
-                if distance < 1:
-                    distance = 1
-
+                distance = max(distance, 1)
                 gstop += distance
 
-            if lengthCDS > 0:
-                if lengthCDS < glen:
-                    gstop = newFeatureList[0][0] + lengthCDS
+            if 0 < lengthCDS < glen:
+                gstop = newFeatureList[0][0] + lengthCDS
 
-            if lengthCDS < 0:
-                if abs(lengthCDS) < glen:
-                    gstart = newFeatureList[-1][1] + lengthCDS
+            if lengthCDS < 0 and abs(lengthCDS) < glen:
+                gstart = newFeatureList[-1][1] + lengthCDS
         else:
             if not useCDS and upstream > 0:
                 if upstreamSpanTSS:
@@ -692,36 +665,29 @@ def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True,
                     distance = upstream
 
                 if adjustToNeighbor:
-                    nextGene = genomeObject.rightGeneDistance((genome, gid), distance * 2)
+                    nextGene = genome.rightGeneDistance((genomeName, gid), distance * 2)
                     if nextGene < distance * 2:
                         distance = nextGene / 2
 
-                if distance < 1:
-                    distance = 1
-
+                distance = max(distance, 1)
                 gstop += distance
 
             if downstream > 0:
                 distance = downstream
                 if adjustToNeighbor:
-                    nextGene = genomeObject.leftGeneDistance((genome, gid), downstream * 2)
+                    nextGene = genome.leftGeneDistance((genomeName, gid), downstream * 2)
                     if nextGene < downstream * 2:
                         distance = nextGene / 2
 
-                if distance < 1:
-                    distance = 1
-
+                distance = max(distance, 1)
                 gstart -= distance
 
-            if lengthCDS > 0:
-                if lengthCDS < glen:
-                    gstart = newFeatureList[-1][-1] - lengthCDS
+            if 0 < lengthCDS < glen:
+                gstart = newFeatureList[-1][-1] - lengthCDS
 
-            if lengthCDS < 0:
-                if abs(lengthCDS) < glen:
-                    gstop = newFeatureList[0][0] - lengthCDS
+            if lengthCDS < 0 and abs(lengthCDS) < glen:
+                gstop = newFeatureList[0][0] - lengthCDS
 
-        glen = abs(gstop - gstart)
         if chrom not in locusByChromDict:
             locusByChromDict[chrom] = []
 
@@ -736,6 +702,30 @@ def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True,
     return locusByChromDict
 
 
+def getGeneFeatures(genome, additionalRegionsDict):
+    featuresDict = genome.getallGeneFeatures()
+    if len(additionalRegionsDict) > 0:
+        sortList = []
+        for chrom in additionalRegionsDict:
+            for region in additionalRegionsDict[chrom]:
+                label = region.label
+                if label not in sortList:
+                    sortList.append(label)
+
+                if label not in featuresDict:
+                    featuresDict[label] = []
+                    sense = "+"
+                else:
+                    sense = featuresDict[label][0][-1]
+
+                featuresDict[label].append(("custom", chrom, region.start, region.stop, sense))
+
+        for gid in sortList:
+            featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
+
+    return featuresDict
+
+
 def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[],
                       normalizedTag=1., defaultRegionFormat=True, fixedFirstBin=-1,
                       binLength=-1):
@@ -798,7 +788,7 @@ def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[],
                 rlen = regionTuple[lengthField]
                 try:
                     rsense = regionTuple[senseField]
-                except:
+                except IndexError:
                     rsense = "F"
 
                 if tagStart > stop: