X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=commoncode.py;fp=commoncode.py;h=f526daacca0dc91245e71205363216bc17eba2e9;hp=821936a30bc2524f0aa1664de1a66ecb8ae47fd6;hb=77dccd7c98d8cdb60caaf178b1123df71ea662c9;hpb=bc30aca13e5ec397c92e67002fbf7a103130b828 diff --git a/commoncode.py b/commoncode.py index 821936a..f526daa 100755 --- a/commoncode.py +++ b/commoncode.py @@ -17,6 +17,9 @@ import Region commoncodeVersion = 5.6 currentRDSversion = 2.0 +class ErangeError(Exception): + pass + def getReverseComplement(base): revComp = {"A": "T", @@ -65,12 +68,12 @@ def getGeneAnnotDict(genome, inRAM=False): return getExtendedGeneAnnotDict(genome, "", inRAM=inRAM) -def getExtendedGeneAnnotDict(genome, extendGenome, replaceModels=False, inRAM=False): - hg = Genome(genome, inRAM=inRAM) +def getExtendedGeneAnnotDict(genomeName, extendGenome, replaceModels=False, inRAM=False): + genome = Genome(genomeName, inRAM=inRAM) if extendGenome != "": - hg.extendFeatures(extendGenome, replace=replaceModels) + genome.extendFeatures(extendGenome, replace=replaceModels) - geneannotDict = hg.allAnnotInfo() + geneannotDict = genome.allAnnotInfo() return geneannotDict @@ -189,11 +192,6 @@ def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False, mergeCount = 0 chromField = int(chromField) count = 0 - #TODO: Current algorithm processes input file line by line and compares with prior lines. Problem is it - # exits at the first merge. This is not a problem when the input is sorted by start position, but in - # the case of 3 regions ABC that are in the input file as ACB as it goes now when processing C there - # will be no merge with A as B is needed to bridge the two. When it comes time to process B it will - # be merged with A but that will exit the loop and the merge with C will be missed. for regionEntry in regionList: if regionEntry[0] == "#": if "pvalue" in regionEntry: @@ -336,7 +334,7 @@ def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False, """ if shift == "auto": - shift = getBestShiftForRegion(hitList, start, length, doWeight, maxshift) + shift = getBestShiftForRegion(hitList, start, length, useWeight=doWeight, maxShift=maxshift) seqArray, regionArray, numHits, numPlus = findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, leftPlus) @@ -346,7 +344,7 @@ def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False, smoothArray[pos] = (seqArray[pos -2] + 2 * seqArray[pos - 1] + 3 * seqArray[pos] + 2 * seqArray[pos + 1] + seqArray[pos + 2]) / 9.0 topPos = getPeakPositionList(smoothArray, length) - peak = Peak(topPos, numHits, smoothArray, numPlus, shift=shift) + peak = Peak.Peak(topPos, numHits, smoothArray, numPlus, shift=shift) if leftPlus: numLeftPlus = 0 @@ -366,12 +364,12 @@ def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False, return peak -def getBestShiftForRegion(hitList, start, length, doWeight=False, maxShift=75): +def getBestShiftForRegion(readList, start, length, useWeight=False, maxShift=75): bestShift = 0 lowestScore = 20000000000 for testShift in xrange(maxShift + 1): shiftArray = array("f", [0.] * length) - for read in hitList: + for read in readList: currentpos = read["start"] - start if read["sense"] == "+": currentpos += testShift @@ -381,7 +379,7 @@ def getBestShiftForRegion(hitList, start, length, doWeight=False, maxShift=75): if (currentpos < 1) or (currentpos >= length): continue - if doWeight: + if useWeight: weight = read["weight"] else: weight = 1.0 @@ -432,7 +430,7 @@ def findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, left hitIndex += 1 currentpos += 1 - while hitIndex < readlen and currentpos < length: + while hitIndex < readlen and currentpos < length: seqArray[currentpos] += weight hitIndex += 1 currentpos += 1 @@ -450,7 +448,7 @@ def getPeakPositionList(smoothArray, length): if topNucleotide < smoothArray[currentpos]: topNucleotide = smoothArray[currentpos] peakList = [currentpos] - elif topNucleotide == smoothArray[currentpos]: + elif topNucleotide == smoothArray[currentpos]: peakList.append(currentpos) return peakList @@ -562,7 +560,7 @@ def getFeaturesByChromDict(genomeObject, additionalRegionsDict={}, ignorePseudo= return featuresByChromDict -def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True, +def getLocusByChromDict(genome, upstream=0, downstream=0, useCDS=True, additionalRegionsDict={}, ignorePseudo=False, upstreamSpanTSS=False, lengthCDS=0, keepSense=False, adjustToNeighbor=True): """ return a dictionary of gene loci. Can be used to retrieve additional @@ -594,27 +592,8 @@ def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True, print "getLocusByChromDict: asked for discontinuous partial CDS from stop and upstream - returning empty dict" return locusByChromDict - genome = genomeObject.genome - featuresDict = genomeObject.getallGeneFeatures() - if len(additionalRegionsDict) > 0: - sortList = [] - for chrom in additionalRegionsDict: - for region in additionalRegionsDict[chrom]: - label = region.label - if label not in sortList: - sortList.append(label) - - if label not in featuresDict: - featuresDict[label] = [] - sense = "+" - else: - sense = featuresDict[label][0][-1] - - featuresDict[label].append(("custom", chrom, region.start, region.stop, sense)) - - for gid in sortList: - featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2])) - + genomeName = genome.genome + featuresDict = getGeneFeatures(genome, additionalRegionsDict) for gid in featuresDict: featureList = featuresDict[gid] newFeatureList = [] @@ -647,34 +626,28 @@ def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True, distance = upstream if adjustToNeighbor: - nextGene = genomeObject.leftGeneDistance((genome, gid), distance * 2) + nextGene = genome.leftGeneDistance((genomeName, gid), distance * 2) if nextGene < distance * 2: distance = nextGene / 2 - if distance < 1: - distance = 1 - + distance = max(distance, 1) gstart -= distance if downstream > 0: distance = downstream if adjustToNeighbor: - nextGene = genomeObject.rightGeneDistance((genome, gid), downstream * 2) + nextGene = genome.rightGeneDistance((genomeName, gid), downstream * 2) if nextGene < downstream * 2: distance = nextGene / 2 - if distance < 1: - distance = 1 - + distance = max(distance, 1) gstop += distance - if lengthCDS > 0: - if lengthCDS < glen: - gstop = newFeatureList[0][0] + lengthCDS + if 0 < lengthCDS < glen: + gstop = newFeatureList[0][0] + lengthCDS - if lengthCDS < 0: - if abs(lengthCDS) < glen: - gstart = newFeatureList[-1][1] + lengthCDS + if lengthCDS < 0 and abs(lengthCDS) < glen: + gstart = newFeatureList[-1][1] + lengthCDS else: if not useCDS and upstream > 0: if upstreamSpanTSS: @@ -692,36 +665,29 @@ def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True, distance = upstream if adjustToNeighbor: - nextGene = genomeObject.rightGeneDistance((genome, gid), distance * 2) + nextGene = genome.rightGeneDistance((genomeName, gid), distance * 2) if nextGene < distance * 2: distance = nextGene / 2 - if distance < 1: - distance = 1 - + distance = max(distance, 1) gstop += distance if downstream > 0: distance = downstream if adjustToNeighbor: - nextGene = genomeObject.leftGeneDistance((genome, gid), downstream * 2) + nextGene = genome.leftGeneDistance((genomeName, gid), downstream * 2) if nextGene < downstream * 2: distance = nextGene / 2 - if distance < 1: - distance = 1 - + distance = max(distance, 1) gstart -= distance - if lengthCDS > 0: - if lengthCDS < glen: - gstart = newFeatureList[-1][-1] - lengthCDS + if 0 < lengthCDS < glen: + gstart = newFeatureList[-1][-1] - lengthCDS - if lengthCDS < 0: - if abs(lengthCDS) < glen: - gstop = newFeatureList[0][0] - lengthCDS + if lengthCDS < 0 and abs(lengthCDS) < glen: + gstop = newFeatureList[0][0] - lengthCDS - glen = abs(gstop - gstart) if chrom not in locusByChromDict: locusByChromDict[chrom] = [] @@ -736,6 +702,30 @@ def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True, return locusByChromDict +def getGeneFeatures(genome, additionalRegionsDict): + featuresDict = genome.getallGeneFeatures() + if len(additionalRegionsDict) > 0: + sortList = [] + for chrom in additionalRegionsDict: + for region in additionalRegionsDict[chrom]: + label = region.label + if label not in sortList: + sortList.append(label) + + if label not in featuresDict: + featuresDict[label] = [] + sense = "+" + else: + sense = featuresDict[label][0][-1] + + featuresDict[label].append(("custom", chrom, region.start, region.stop, sense)) + + for gid in sortList: + featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2])) + + return featuresDict + + def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[], normalizedTag=1., defaultRegionFormat=True, fixedFirstBin=-1, binLength=-1): @@ -798,7 +788,7 @@ def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[], rlen = regionTuple[lengthField] try: rsense = regionTuple[senseField] - except: + except IndexError: rsense = "F" if tagStart > stop: