-#
-# commoncode.py
-# ENRAGE
-#
-
import ConfigParser
import os
import string
commoncodeVersion = 5.6
currentRDSversion = 2.0
+class ErangeError(Exception):
+ pass
+
def getReverseComplement(base):
revComp = {"A": "T",
return getExtendedGeneAnnotDict(genome, "", inRAM=inRAM)
-def getExtendedGeneAnnotDict(genome, extendGenome, replaceModels=False, inRAM=False):
- hg = Genome(genome, inRAM=inRAM)
+def getExtendedGeneAnnotDict(genomeName, extendGenome, replaceModels=False, inRAM=False):
+ genome = Genome(genomeName, inRAM=inRAM)
if extendGenome != "":
- hg.extendFeatures(extendGenome, replace=replaceModels)
+ genome.extendFeatures(extendGenome, replace=replaceModels)
- geneannotDict = hg.allAnnotInfo()
+ geneannotDict = genome.allAnnotInfo()
return geneannotDict
"""
if shift == "auto":
- shift = getBestShiftForRegion(hitList, start, length, doWeight, maxshift)
+ shift = getBestShiftForRegion(hitList, start, length, useWeight=doWeight, maxShift=maxshift)
seqArray, regionArray, numHits, numPlus = findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, leftPlus)
smoothArray[pos] = (seqArray[pos -2] + 2 * seqArray[pos - 1] + 3 * seqArray[pos] + 2 * seqArray[pos + 1] + seqArray[pos + 2]) / 9.0
topPos = getPeakPositionList(smoothArray, length)
- peak = Peak(topPos, numHits, smoothArray, numPlus, shift=shift)
+ peak = Peak.Peak(topPos, numHits, smoothArray, numPlus, shift=shift)
if leftPlus:
numLeftPlus = 0
return peak
-def getBestShiftForRegion(hitList, start, length, doWeight=False, maxShift=75):
+def getBestShiftForRegion(readList, start, length, useWeight=False, maxShift=75):
bestShift = 0
lowestScore = 20000000000
for testShift in xrange(maxShift + 1):
shiftArray = array("f", [0.] * length)
- for read in hitList:
+ for read in readList:
currentpos = read["start"] - start
if read["sense"] == "+":
- currentpos += testShift
+ senseModifier = 1
else:
- currentpos -= testShift
+ senseModifier = -1
+ currentpos += testShift * senseModifier
if (currentpos < 1) or (currentpos >= length):
continue
- if doWeight:
+ if useWeight:
weight = read["weight"]
else:
weight = 1.0
- if read["sense"] == "+":
- shiftArray[currentpos] += weight
- else:
- shiftArray[currentpos] -= weight
+ shiftArray[currentpos] += weight * senseModifier
currentScore = 0
for score in shiftArray:
currentScore += abs(score)
- print currentScore
if currentScore < lowestScore:
bestShift = testShift
lowestScore = currentScore
hitIndex += 1
currentpos += 1
- while hitIndex < readlen and currentpos < length:
+ while hitIndex < readlen and currentpos < length:
seqArray[currentpos] += weight
hitIndex += 1
currentpos += 1
if topNucleotide < smoothArray[currentpos]:
topNucleotide = smoothArray[currentpos]
peakList = [currentpos]
- elif topNucleotide == smoothArray[currentpos]:
+ elif topNucleotide == smoothArray[currentpos]:
peakList.append(currentpos)
return peakList
if restrictGID and gid not in restrictList:
continue
- featureList = featuresDict[gid]
- newFeatureList = []
- isPseudo = False
- for (ftype, chrom, start, stop, sense) in featureList:
- if ftype == "PSEUDO":
- isPseudo = True
-
- if (start, stop, ftype) not in newFeatureList:
- notContained = True
- containedList = []
- for (fstart, fstop, ftype2) in newFeatureList:
- if start >= fstart and stop <= fstop:
- notContained = False
-
- if start < fstart and stop > fstop:
- containedList.append((fstart, fstop))
-
- if len(containedList) > 0:
- newFList = []
- notContained = True
- for (fstart, fstop, ftype2) in newFeatureList:
- if (fstart, fstop) not in containedList:
- newFList.append((fstart, fstop, ftype2))
- if start >= fstart and stop <= fstop:
- notContained = False
-
- newFeatureList = newFList
- if notContained:
- newFeatureList.append((start, stop, ftype))
-
- if ignorePseudo and isPseudo:
- continue
-
- if chrom not in featuresByChromDict:
+ newFeatureList, chrom, sense = getNewFeatureList(featuresDict[gid])
+ if newFeatureList and chrom not in featuresByChromDict:
featuresByChromDict[chrom] = []
for (start, stop, ftype) in newFeatureList:
return featuresByChromDict
-def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True,
+def getNewFeatureList(featureList, ignorePseudo=False):
+ newFeatureList = []
+ for (featureType, chrom, start, stop, sense) in featureList:
+ if featureType == "PSEUDO" and ignorePseudo:
+ return [], chrom, sense
+
+ if (start, stop, featureType) not in newFeatureList:
+ notContained = True
+ containedList = []
+ for (newFeatureStart, newFeatureStop, featureType2) in newFeatureList:
+ if start >= newFeatureStart and stop <= newFeatureStop:
+ notContained = False
+
+ if start < newFeatureStart and stop > newFeatureStop:
+ containedList.append((newFeatureStart, newFeatureStop))
+
+ if len(containedList) > 0:
+ newFList = []
+ notContained = True
+ for (newFeatureStart, newFeatureStop, featureType2) in newFeatureList:
+ if (newFeatureStart, newFeatureStop) not in containedList:
+ newFList.append((newFeatureStart, newFeatureStop, featureType2))
+ if start >= newFeatureStart and stop <= newFeatureStop:
+ notContained = False
+
+ newFeatureList = newFList
+
+ if notContained:
+ newFeatureList.append((start, stop, featureType))
+
+ return newFeatureList, chrom, sense
+
+
+def getLocusByChromDict(genome, upstream=0, downstream=0, useCDS=True,
additionalRegionsDict={}, ignorePseudo=False, upstreamSpanTSS=False,
lengthCDS=0, keepSense=False, adjustToNeighbor=True):
""" return a dictionary of gene loci. Can be used to retrieve additional
lengthCDS < 0bp, return only the last X bp from CDS.
"""
locusByChromDict = {}
- if upstream == 0 and downstream == 0 and not useCDS:
- print "getLocusByChromDict: asked for no sequence - returning empty dict"
+ if usingBadLocusParameters(upstream, downstream, useCDS, lengthCDS, upstreamSpanTSS):
return locusByChromDict
- elif upstream > 0 and downstream > 0 and not useCDS:
- print "getLocusByChromDict: asked for only upstream and downstream - returning empty dict"
- return locusByChromDict
- elif lengthCDS != 0 and not useCDS:
- print "getLocusByChromDict: asked for partial CDS but not useCDS - returning empty dict"
- return locusByChromDict
- elif upstreamSpanTSS and lengthCDS != 0:
- print "getLocusByChromDict: asked for TSS spanning and partial CDS - returning empty dict"
- return locusByChromDict
- elif lengthCDS > 0 and downstream > 0:
- print "getLocusByChromDict: asked for discontinuous partial CDS from start and downstream - returning empty dict"
- return locusByChromDict
- elif lengthCDS < 0 and upstream > 0:
- print "getLocusByChromDict: asked for discontinuous partial CDS from stop and upstream - returning empty dict"
- return locusByChromDict
-
- genome = genomeObject.genome
- featuresDict = genomeObject.getallGeneFeatures()
- if len(additionalRegionsDict) > 0:
- sortList = []
- for chrom in additionalRegionsDict:
- for region in additionalRegionsDict[chrom]:
- label = region.label
- if label not in sortList:
- sortList.append(label)
-
- if label not in featuresDict:
- featuresDict[label] = []
- sense = "+"
- else:
- sense = featuresDict[label][0][-1]
-
- featuresDict[label].append(("custom", chrom, region.start, region.stop, sense))
-
- for gid in sortList:
- featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
+ genomeName = genome.genome
+ featuresDict = getGeneFeatures(genome, additionalRegionsDict)
for gid in featuresDict:
featureList = featuresDict[gid]
newFeatureList = []
distance = upstream
if adjustToNeighbor:
- nextGene = genomeObject.leftGeneDistance((genome, gid), distance * 2)
+ nextGene = genome.leftGeneDistance((genomeName, gid), distance * 2)
if nextGene < distance * 2:
distance = nextGene / 2
- if distance < 1:
- distance = 1
-
+ distance = max(distance, 1)
gstart -= distance
if downstream > 0:
distance = downstream
if adjustToNeighbor:
- nextGene = genomeObject.rightGeneDistance((genome, gid), downstream * 2)
+ nextGene = genome.rightGeneDistance((genomeName, gid), downstream * 2)
if nextGene < downstream * 2:
distance = nextGene / 2
- if distance < 1:
- distance = 1
-
+ distance = max(distance, 1)
gstop += distance
- if lengthCDS > 0:
- if lengthCDS < glen:
- gstop = newFeatureList[0][0] + lengthCDS
+ if 0 < lengthCDS < glen:
+ gstop = newFeatureList[0][0] + lengthCDS
- if lengthCDS < 0:
- if abs(lengthCDS) < glen:
- gstart = newFeatureList[-1][1] + lengthCDS
+ if lengthCDS < 0 and abs(lengthCDS) < glen:
+ gstart = newFeatureList[-1][1] + lengthCDS
else:
if not useCDS and upstream > 0:
if upstreamSpanTSS:
distance = upstream
if adjustToNeighbor:
- nextGene = genomeObject.rightGeneDistance((genome, gid), distance * 2)
+ nextGene = genome.rightGeneDistance((genomeName, gid), distance * 2)
if nextGene < distance * 2:
distance = nextGene / 2
- if distance < 1:
- distance = 1
-
+ distance = max(distance, 1)
gstop += distance
if downstream > 0:
distance = downstream
if adjustToNeighbor:
- nextGene = genomeObject.leftGeneDistance((genome, gid), downstream * 2)
+ nextGene = genome.leftGeneDistance((genomeName, gid), downstream * 2)
if nextGene < downstream * 2:
distance = nextGene / 2
- if distance < 1:
- distance = 1
-
+ distance = max(distance, 1)
gstart -= distance
- if lengthCDS > 0:
- if lengthCDS < glen:
- gstart = newFeatureList[-1][-1] - lengthCDS
+ if 0 < lengthCDS < glen:
+ gstart = newFeatureList[-1][-1] - lengthCDS
- if lengthCDS < 0:
- if abs(lengthCDS) < glen:
- gstop = newFeatureList[0][0] - lengthCDS
+ if lengthCDS < 0 and abs(lengthCDS) < glen:
+ gstop = newFeatureList[0][0] - lengthCDS
- glen = abs(gstop - gstart)
if chrom not in locusByChromDict:
locusByChromDict[chrom] = []
return locusByChromDict
+def usingBadLocusParameters(upstream, downstream, useCDS, lengthCDS, upstreamSpanTSS):
+ if upstream == 0 and downstream == 0 and not useCDS:
+ print "getLocusByChromDict: asked for no sequence - returning empty dict"
+ return True
+ elif upstream > 0 and downstream > 0 and not useCDS:
+ print "getLocusByChromDict: asked for only upstream and downstream - returning empty dict"
+ return True
+ elif lengthCDS != 0 and not useCDS:
+ print "getLocusByChromDict: asked for partial CDS but not useCDS - returning empty dict"
+ return True
+ elif upstreamSpanTSS and lengthCDS != 0:
+ print "getLocusByChromDict: asked for TSS spanning and partial CDS - returning empty dict"
+ return True
+ elif lengthCDS > 0 and downstream > 0:
+ print "getLocusByChromDict: asked for discontinuous partial CDS from start and downstream - returning empty dict"
+ return True
+ elif lengthCDS < 0 and upstream > 0:
+ print "getLocusByChromDict: asked for discontinuous partial CDS from stop and upstream - returning empty dict"
+ return True
+
+ return False
+
+
+def getGeneFeatures(genome, additionalRegionsDict):
+ featuresDict = genome.getallGeneFeatures()
+ if len(additionalRegionsDict) > 0:
+ sortList = []
+ for chrom in additionalRegionsDict:
+ for region in additionalRegionsDict[chrom]:
+ label = region.label
+ if label not in sortList:
+ sortList.append(label)
+
+ if label not in featuresDict:
+ featuresDict[label] = []
+ sense = "+"
+ else:
+ sense = featuresDict[label][0][-1]
+
+ featuresDict[label].append(("custom", chrom, region.start, region.stop, sense))
+
+ for gid in sortList:
+ featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
+
+ return featuresDict
+
+
def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[],
normalizedTag=1., defaultRegionFormat=True, fixedFirstBin=-1,
binLength=-1):
rlen = regionTuple[lengthField]
try:
rsense = regionTuple[senseField]
- except:
+ except IndexError:
rsense = "F"
if tagStart > stop:
else:
regionBinLength = binLength
- startdist = tagStart - start
if rsense == "F":
- # we are relying on python's integer division quirk
- binID = startdist / regionBinLength
- if (fixedFirstBin > 0) and (startdist < fixedFirstBin):
- binID = 0
- elif fixedFirstBin > 0:
- binID = 1
-
- if binID >= bins:
- binID = bins - 1
-
- try:
- regionsBins[regionID][binID] += normalizedTag * weight
- except KeyError:
- print "%s %s" % (regionID, str(binID))
+ positionInRegion = tagStart - start
else:
- rdist = rlen - startdist
- binID = rdist / regionBinLength
- if (fixedFirstBin > 0) and (rdist < fixedFirstBin):
- binID = 0
- elif fixedFirstBin > 0:
- binID = 1
-
- if binID >= bins:
- binID = bins - 1
-
- try:
- regionsBins[regionID][binID] += normalizedTag * weight
- except KeyError:
- print "%s %s" % (regionID, str(binID))
+ positionInRegion = start + rlen - tagStart
+
+ # we are relying on python's integer division quirk
+ binID = positionInRegion / regionBinLength
+ if (fixedFirstBin > 0) and (positionInRegion < fixedFirstBin):
+ binID = 0
+ elif fixedFirstBin > 0:
+ binID = 1
+
+ if binID >= bins:
+ binID = bins - 1
+
+ try:
+ regionsBins[regionID][binID] += normalizedTag * weight
+ except KeyError:
+ print "%s %s" % (regionID, str(binID))
stopPoint = stop