-#
-# commoncode.py
-# ENRAGE
-#
-
import ConfigParser
import os
import string
import Region
commoncodeVersion = 5.6
-currentRDSversion = 2.0
+#TODO: This is a function dumping ground - needs to be reworked
class ErangeError(Exception):
pass
logfile.close()
+def getChromInfoList(chrominfo):
+ chromInfoList = []
+ linelist = open(chrominfo)
+ for line in linelist:
+ fields = line.strip().split('\t')
+ chr = fields[0]
+ start = 0
+ end = int(fields[1])
+ chromInfoList.append((chr,start,end))
+
+ linelist.close()
+
+ return chromInfoList
+
+
+# BAM file support functions
+def isSpliceEntry(cigarTupleList):
+ isSplice = False
+ if cigarTupleList is not None:
+ for operation,length in cigarTupleList:
+ if operation == 3:
+ isSplice = True
+ break
+
+ return isSplice
+
+
+def addPairIDtoReadID(alignedread):
+ ID = alignedread.qname
+ if alignedread.is_read1:
+ ID = ID + '/1'
+
+ if alignedread.is_read2:
+ ID = ID + '/2'
+
+ return ID
+
+
+def getHeaderComment(bamHeader, commentKey):
+ for comment in bamHeader["CO"]:
+ fields = comment.split("\t")
+ if fields[0] == commentKey:
+ return fields[1]
+
+ raise KeyError
+
+
+def getReadSense(read):
+ if read.is_reverse:
+ sense = "-"
+ else:
+ sense = "+"
+
+ return sense
+
+#TODO: is this where we need to take into account the NH > 10 restriction?
+def countThisRead(read, countUniqs, countMulti, countSplices, sense):
+ countRead = True
+
+ if sense in ['-', '+'] and sense != getReadSense(read):
+ countRead = False
+ elif not countSplices and isSpliceEntry(read.cigar):
+ countRead = False
+ elif not countUniqs and read.opt('NH') == 1:
+ countRead = False
+ elif not countMulti and read.opt('NH') > 1:
+ countRead = False
+
+ return countRead
+
+
+# Cistematic functions
def getGeneInfoDict(genome, cache=False):
idb = geneinfoDB(cache=cache)
if genome == "dmelanogaster":
return geneannotDict
+# Configuration File Support
def getConfigParser(fileList=[]):
configFiles = ["erange.config", os.path.expanduser("~/.erange.config")]
for filename in fileList:
return setting
+# All the Other Stuff from before
def getMergedRegions(regionfilename, maxDist=1000, minHits=0, verbose=False, keepLabel=False,
fullChrom=False, chromField=1, scoreField=4, pad=0, compact=False,
doMerge=True, keepPeak=False, returnTop=0):
mergeCount = 0
chromField = int(chromField)
count = 0
+ #TODO: Current algorithm processes input file line by line and compares with prior lines. Problem is it
+ # exits at the first merge. This is not a problem when the input is sorted by start position, but in
+ # the case of 3 regions ABC that are in the input file as ACB as it goes now when processing C there
+ # will be no merge with A as B is needed to bridge the two. When it comes time to process B it will
+ # be merged with A but that will exit the loop and the merge with C will be missed.
+ #TODO: Are we going to require sorted region files to have this even work?
for regionEntry in regionList:
if regionEntry[0] == "#":
if "pvalue" in regionEntry:
for read in readList:
currentpos = read["start"] - start
if read["sense"] == "+":
- currentpos += testShift
+ senseModifier = 1
else:
- currentpos -= testShift
+ senseModifier = -1
+ currentpos += testShift * senseModifier
if (currentpos < 1) or (currentpos >= length):
continue
else:
weight = 1.0
- if read["sense"] == "+":
- shiftArray[currentpos] += weight
- else:
- shiftArray[currentpos] -= weight
+ shiftArray[currentpos] += weight * senseModifier
currentScore = 0
for score in shiftArray:
currentScore += abs(score)
- print currentScore
if currentScore < lowestScore:
bestShift = testShift
lowestScore = currentScore
if restrictGID and gid not in restrictList:
continue
- featureList = featuresDict[gid]
- newFeatureList = []
- isPseudo = False
- for (ftype, chrom, start, stop, sense) in featureList:
- if ftype == "PSEUDO":
- isPseudo = True
-
- if (start, stop, ftype) not in newFeatureList:
- notContained = True
- containedList = []
- for (fstart, fstop, ftype2) in newFeatureList:
- if start >= fstart and stop <= fstop:
- notContained = False
-
- if start < fstart and stop > fstop:
- containedList.append((fstart, fstop))
-
- if len(containedList) > 0:
- newFList = []
- notContained = True
- for (fstart, fstop, ftype2) in newFeatureList:
- if (fstart, fstop) not in containedList:
- newFList.append((fstart, fstop, ftype2))
- if start >= fstart and stop <= fstop:
- notContained = False
-
- newFeatureList = newFList
- if notContained:
- newFeatureList.append((start, stop, ftype))
-
- if ignorePseudo and isPseudo:
- continue
-
- if chrom not in featuresByChromDict:
+ newFeatureList, chrom, sense = getNewFeatureList(featuresDict[gid])
+ if newFeatureList and chrom not in featuresByChromDict:
featuresByChromDict[chrom] = []
for (start, stop, ftype) in newFeatureList:
return featuresByChromDict
+def getNewFeatureList(featureList, ignorePseudo=False):
+ newFeatureList = []
+ for (featureType, chrom, start, stop, sense) in featureList:
+ if featureType == "PSEUDO" and ignorePseudo:
+ return [], chrom, sense
+
+ if (start, stop, featureType) not in newFeatureList:
+ notContained = True
+ containedList = []
+ for (newFeatureStart, newFeatureStop, featureType2) in newFeatureList:
+ if start >= newFeatureStart and stop <= newFeatureStop:
+ notContained = False
+
+ if start < newFeatureStart and stop > newFeatureStop:
+ containedList.append((newFeatureStart, newFeatureStop))
+
+ if len(containedList) > 0:
+ newFList = []
+ notContained = True
+ for (newFeatureStart, newFeatureStop, featureType2) in newFeatureList:
+ if (newFeatureStart, newFeatureStop) not in containedList:
+ newFList.append((newFeatureStart, newFeatureStop, featureType2))
+ if start >= newFeatureStart and stop <= newFeatureStop:
+ notContained = False
+
+ newFeatureList = newFList
+
+ if notContained:
+ newFeatureList.append((start, stop, featureType))
+
+ return newFeatureList, chrom, sense
+
+
def getLocusByChromDict(genome, upstream=0, downstream=0, useCDS=True,
additionalRegionsDict={}, ignorePseudo=False, upstreamSpanTSS=False,
lengthCDS=0, keepSense=False, adjustToNeighbor=True):
lengthCDS < 0bp, return only the last X bp from CDS.
"""
locusByChromDict = {}
- if upstream == 0 and downstream == 0 and not useCDS:
- print "getLocusByChromDict: asked for no sequence - returning empty dict"
- return locusByChromDict
- elif upstream > 0 and downstream > 0 and not useCDS:
- print "getLocusByChromDict: asked for only upstream and downstream - returning empty dict"
- return locusByChromDict
- elif lengthCDS != 0 and not useCDS:
- print "getLocusByChromDict: asked for partial CDS but not useCDS - returning empty dict"
- return locusByChromDict
- elif upstreamSpanTSS and lengthCDS != 0:
- print "getLocusByChromDict: asked for TSS spanning and partial CDS - returning empty dict"
- return locusByChromDict
- elif lengthCDS > 0 and downstream > 0:
- print "getLocusByChromDict: asked for discontinuous partial CDS from start and downstream - returning empty dict"
- return locusByChromDict
- elif lengthCDS < 0 and upstream > 0:
- print "getLocusByChromDict: asked for discontinuous partial CDS from stop and upstream - returning empty dict"
+ if usingBadLocusParameters(upstream, downstream, useCDS, lengthCDS, upstreamSpanTSS):
return locusByChromDict
genomeName = genome.genome
return locusByChromDict
+def usingBadLocusParameters(upstream, downstream, useCDS, lengthCDS, upstreamSpanTSS):
+ if upstream == 0 and downstream == 0 and not useCDS:
+ print "getLocusByChromDict: asked for no sequence - returning empty dict"
+ return True
+ elif upstream > 0 and downstream > 0 and not useCDS:
+ print "getLocusByChromDict: asked for only upstream and downstream - returning empty dict"
+ return True
+ elif lengthCDS != 0 and not useCDS:
+ print "getLocusByChromDict: asked for partial CDS but not useCDS - returning empty dict"
+ return True
+ elif upstreamSpanTSS and lengthCDS != 0:
+ print "getLocusByChromDict: asked for TSS spanning and partial CDS - returning empty dict"
+ return True
+ elif lengthCDS > 0 and downstream > 0:
+ print "getLocusByChromDict: asked for discontinuous partial CDS from start and downstream - returning empty dict"
+ return True
+ elif lengthCDS < 0 and upstream > 0:
+ print "getLocusByChromDict: asked for discontinuous partial CDS from stop and upstream - returning empty dict"
+ return True
+
+ return False
+
+
def getGeneFeatures(genome, additionalRegionsDict):
featuresDict = genome.getallGeneFeatures()
if len(additionalRegionsDict) > 0:
else:
regionBinLength = binLength
- startdist = tagStart - start
if rsense == "F":
- # we are relying on python's integer division quirk
- binID = startdist / regionBinLength
- if (fixedFirstBin > 0) and (startdist < fixedFirstBin):
- binID = 0
- elif fixedFirstBin > 0:
- binID = 1
-
- if binID >= bins:
- binID = bins - 1
-
- try:
- regionsBins[regionID][binID] += normalizedTag * weight
- except KeyError:
- print "%s %s" % (regionID, str(binID))
+ positionInRegion = tagStart - start
else:
- rdist = rlen - startdist
- binID = rdist / regionBinLength
- if (fixedFirstBin > 0) and (rdist < fixedFirstBin):
- binID = 0
- elif fixedFirstBin > 0:
- binID = 1
-
- if binID >= bins:
- binID = bins - 1
-
- try:
- regionsBins[regionID][binID] += normalizedTag * weight
- except KeyError:
- print "%s %s" % (regionID, str(binID))
+ positionInRegion = start + rlen - tagStart
+
+ # we are relying on python's integer division quirk
+ binID = positionInRegion / regionBinLength
+ if (fixedFirstBin > 0) and (positionInRegion < fixedFirstBin):
+ binID = 0
+ elif fixedFirstBin > 0:
+ binID = 1
+
+ if binID >= bins:
+ binID = bins - 1
+
+ try:
+ regionsBins[regionID][binID] += normalizedTag * weight
+ except KeyError:
+ print "%s %s" % (regionID, str(binID))
stopPoint = stop