X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=commoncode.py;fp=commoncode.py;h=6ef4edfcb3e10d75ae0eecea887f45581f9e4edb;hp=f526daacca0dc91245e71205363216bc17eba2e9;hb=03f1e0b3bab22d517ad75b9af4d54e8fcb8540fb;hpb=b54eee5365a0fad35d2f6168eaac82ff5a359222 diff --git a/commoncode.py b/commoncode.py index f526daa..6ef4edf 100755 --- a/commoncode.py +++ b/commoncode.py @@ -1,8 +1,3 @@ -# -# commoncode.py -# ENRAGE -# - import ConfigParser import os import string @@ -192,6 +187,11 @@ def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False, mergeCount = 0 chromField = int(chromField) count = 0 + #TODO: Current algorithm processes input file line by line and compares with prior lines. Problem is it + # exits at the first merge. This is not a problem when the input is sorted by start position, but in + # the case of 3 regions ABC that are in the input file as ACB as it goes now when processing C there + # will be no merge with A as B is needed to bridge the two. When it comes time to process B it will + # be merged with A but that will exit the loop and the merge with C will be missed. for regionEntry in regionList: if regionEntry[0] == "#": if "pvalue" in regionEntry: @@ -372,10 +372,11 @@ def getBestShiftForRegion(readList, start, length, useWeight=False, maxShift=75) for read in readList: currentpos = read["start"] - start if read["sense"] == "+": - currentpos += testShift + senseModifier = 1 else: - currentpos -= testShift + senseModifier = -1 + currentpos += testShift * senseModifier if (currentpos < 1) or (currentpos >= length): continue @@ -384,16 +385,12 @@ def getBestShiftForRegion(readList, start, length, useWeight=False, maxShift=75) else: weight = 1.0 - if read["sense"] == "+": - shiftArray[currentpos] += weight - else: - shiftArray[currentpos] -= weight + shiftArray[currentpos] += weight * senseModifier currentScore = 0 for score in shiftArray: currentScore += abs(score) - print currentScore if currentScore < lowestScore: bestShift = testShift lowestScore = currentScore @@ -494,40 +491,8 @@ def getFeaturesByChromDict(genomeObject, additionalRegionsDict={}, ignorePseudo= if restrictGID and gid not in restrictList: continue - featureList = featuresDict[gid] - newFeatureList = [] - isPseudo = False - for (ftype, chrom, start, stop, sense) in featureList: - if ftype == "PSEUDO": - isPseudo = True - - if (start, stop, ftype) not in newFeatureList: - notContained = True - containedList = [] - for (fstart, fstop, ftype2) in newFeatureList: - if start >= fstart and stop <= fstop: - notContained = False - - if start < fstart and stop > fstop: - containedList.append((fstart, fstop)) - - if len(containedList) > 0: - newFList = [] - notContained = True - for (fstart, fstop, ftype2) in newFeatureList: - if (fstart, fstop) not in containedList: - newFList.append((fstart, fstop, ftype2)) - if start >= fstart and stop <= fstop: - notContained = False - - newFeatureList = newFList - if notContained: - newFeatureList.append((start, stop, ftype)) - - if ignorePseudo and isPseudo: - continue - - if chrom not in featuresByChromDict: + newFeatureList, chrom, sense = getNewFeatureList(featuresDict[gid]) + if newFeatureList and chrom not in featuresByChromDict: featuresByChromDict[chrom] = [] for (start, stop, ftype) in newFeatureList: @@ -560,6 +525,39 @@ def getFeaturesByChromDict(genomeObject, additionalRegionsDict={}, ignorePseudo= return featuresByChromDict +def getNewFeatureList(featureList, ignorePseudo=False): + newFeatureList = [] + for (featureType, chrom, start, stop, sense) in featureList: + if featureType == "PSEUDO" and ignorePseudo: + return [], chrom, sense + + if (start, stop, featureType) not in newFeatureList: + notContained = True + containedList = [] + for (newFeatureStart, newFeatureStop, featureType2) in newFeatureList: + if start >= newFeatureStart and stop <= newFeatureStop: + notContained = False + + if start < newFeatureStart and stop > newFeatureStop: + containedList.append((newFeatureStart, newFeatureStop)) + + if len(containedList) > 0: + newFList = [] + notContained = True + for (newFeatureStart, newFeatureStop, featureType2) in newFeatureList: + if (newFeatureStart, newFeatureStop) not in containedList: + newFList.append((newFeatureStart, newFeatureStop, featureType2)) + if start >= newFeatureStart and stop <= newFeatureStop: + notContained = False + + newFeatureList = newFList + + if notContained: + newFeatureList.append((start, stop, featureType)) + + return newFeatureList, chrom, sense + + def getLocusByChromDict(genome, upstream=0, downstream=0, useCDS=True, additionalRegionsDict={}, ignorePseudo=False, upstreamSpanTSS=False, lengthCDS=0, keepSense=False, adjustToNeighbor=True): @@ -573,23 +571,7 @@ def getLocusByChromDict(genome, upstream=0, downstream=0, useCDS=True, lengthCDS < 0bp, return only the last X bp from CDS. """ locusByChromDict = {} - if upstream == 0 and downstream == 0 and not useCDS: - print "getLocusByChromDict: asked for no sequence - returning empty dict" - return locusByChromDict - elif upstream > 0 and downstream > 0 and not useCDS: - print "getLocusByChromDict: asked for only upstream and downstream - returning empty dict" - return locusByChromDict - elif lengthCDS != 0 and not useCDS: - print "getLocusByChromDict: asked for partial CDS but not useCDS - returning empty dict" - return locusByChromDict - elif upstreamSpanTSS and lengthCDS != 0: - print "getLocusByChromDict: asked for TSS spanning and partial CDS - returning empty dict" - return locusByChromDict - elif lengthCDS > 0 and downstream > 0: - print "getLocusByChromDict: asked for discontinuous partial CDS from start and downstream - returning empty dict" - return locusByChromDict - elif lengthCDS < 0 and upstream > 0: - print "getLocusByChromDict: asked for discontinuous partial CDS from stop and upstream - returning empty dict" + if usingBadLocusParameters(upstream, downstream, useCDS, lengthCDS, upstreamSpanTSS): return locusByChromDict genomeName = genome.genome @@ -702,6 +684,29 @@ def getLocusByChromDict(genome, upstream=0, downstream=0, useCDS=True, return locusByChromDict +def usingBadLocusParameters(upstream, downstream, useCDS, lengthCDS, upstreamSpanTSS): + if upstream == 0 and downstream == 0 and not useCDS: + print "getLocusByChromDict: asked for no sequence - returning empty dict" + return True + elif upstream > 0 and downstream > 0 and not useCDS: + print "getLocusByChromDict: asked for only upstream and downstream - returning empty dict" + return True + elif lengthCDS != 0 and not useCDS: + print "getLocusByChromDict: asked for partial CDS but not useCDS - returning empty dict" + return True + elif upstreamSpanTSS and lengthCDS != 0: + print "getLocusByChromDict: asked for TSS spanning and partial CDS - returning empty dict" + return True + elif lengthCDS > 0 and downstream > 0: + print "getLocusByChromDict: asked for discontinuous partial CDS from start and downstream - returning empty dict" + return True + elif lengthCDS < 0 and upstream > 0: + print "getLocusByChromDict: asked for discontinuous partial CDS from stop and upstream - returning empty dict" + return True + + return False + + def getGeneFeatures(genome, additionalRegionsDict): featuresDict = genome.getallGeneFeatures() if len(additionalRegionsDict) > 0: @@ -805,37 +810,25 @@ def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[], else: regionBinLength = binLength - startdist = tagStart - start if rsense == "F": - # we are relying on python's integer division quirk - binID = startdist / regionBinLength - if (fixedFirstBin > 0) and (startdist < fixedFirstBin): - binID = 0 - elif fixedFirstBin > 0: - binID = 1 - - if binID >= bins: - binID = bins - 1 - - try: - regionsBins[regionID][binID] += normalizedTag * weight - except KeyError: - print "%s %s" % (regionID, str(binID)) + positionInRegion = tagStart - start else: - rdist = rlen - startdist - binID = rdist / regionBinLength - if (fixedFirstBin > 0) and (rdist < fixedFirstBin): - binID = 0 - elif fixedFirstBin > 0: - binID = 1 - - if binID >= bins: - binID = bins - 1 - - try: - regionsBins[regionID][binID] += normalizedTag * weight - except KeyError: - print "%s %s" % (regionID, str(binID)) + positionInRegion = start + rlen - tagStart + + # we are relying on python's integer division quirk + binID = positionInRegion / regionBinLength + if (fixedFirstBin > 0) and (positionInRegion < fixedFirstBin): + binID = 0 + elif fixedFirstBin > 0: + binID = 1 + + if binID >= bins: + binID = bins - 1 + + try: + regionsBins[regionID][binID] += normalizedTag * weight + except KeyError: + print "%s %s" % (regionID, str(binID)) stopPoint = stop