X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=commoncode.py;h=0e4fdbbb7229b5f0c7689218dd9f1cbf2ba40b55;hp=821936a30bc2524f0aa1664de1a66ecb8ae47fd6;hb=HEAD;hpb=0d3e3112fd04c2e6b44a25cacef1d591658ad181 diff --git a/commoncode.py b/commoncode.py index 821936a..0e4fdbb 100755 --- a/commoncode.py +++ b/commoncode.py @@ -1,8 +1,3 @@ -# -# commoncode.py -# ENRAGE -# - import ConfigParser import os import string @@ -15,7 +10,10 @@ from cistematic.genomes import Genome import Region commoncodeVersion = 5.6 -currentRDSversion = 2.0 + +#TODO: This is a function dumping ground - needs to be reworked +class ErangeError(Exception): + pass def getReverseComplement(base): @@ -51,6 +49,78 @@ def writeLog(logFile, messenger, message): logfile.close() +def getChromInfoList(chrominfo): + chromInfoList = [] + linelist = open(chrominfo) + for line in linelist: + fields = line.strip().split('\t') + chr = fields[0] + start = 0 + end = int(fields[1]) + chromInfoList.append((chr,start,end)) + + linelist.close() + + return chromInfoList + + +# BAM file support functions +def isSpliceEntry(cigarTupleList): + isSplice = False + if cigarTupleList is not None: + for operation,length in cigarTupleList: + if operation == 3: + isSplice = True + break + + return isSplice + + +def addPairIDtoReadID(alignedread): + ID = alignedread.qname + if alignedread.is_read1: + ID = ID + '/1' + + if alignedread.is_read2: + ID = ID + '/2' + + return ID + + +def getHeaderComment(bamHeader, commentKey): + for comment in bamHeader["CO"]: + fields = comment.split("\t") + if fields[0] == commentKey: + return fields[1] + + raise KeyError + + +def getReadSense(read): + if read.is_reverse: + sense = "-" + else: + sense = "+" + + return sense + +#TODO: is this where we need to take into account the NH > 10 restriction? +def countThisRead(read, countUniqs, countMulti, countSplices, sense): + countRead = True + + if sense in ['-', '+'] and sense != getReadSense(read): + countRead = False + elif not countSplices and isSpliceEntry(read.cigar): + countRead = False + elif not countUniqs and read.opt('NH') == 1: + countRead = False + elif not countMulti and read.opt('NH') > 1: + countRead = False + + return countRead + + +# Cistematic functions def getGeneInfoDict(genome, cache=False): idb = geneinfoDB(cache=cache) if genome == "dmelanogaster": @@ -65,16 +135,17 @@ def getGeneAnnotDict(genome, inRAM=False): return getExtendedGeneAnnotDict(genome, "", inRAM=inRAM) -def getExtendedGeneAnnotDict(genome, extendGenome, replaceModels=False, inRAM=False): - hg = Genome(genome, inRAM=inRAM) +def getExtendedGeneAnnotDict(genomeName, extendGenome, replaceModels=False, inRAM=False): + genome = Genome(genomeName, inRAM=inRAM) if extendGenome != "": - hg.extendFeatures(extendGenome, replace=replaceModels) + genome.extendFeatures(extendGenome, replace=replaceModels) - geneannotDict = hg.allAnnotInfo() + geneannotDict = genome.allAnnotInfo() return geneannotDict +# Configuration File Support def getConfigParser(fileList=[]): configFiles = ["erange.config", os.path.expanduser("~/.erange.config")] for filename in fileList: @@ -131,6 +202,7 @@ def getAllConfigSectionOptions(parser, section): return setting +# All the Other Stuff from before def getMergedRegions(regionfilename, maxDist=1000, minHits=0, verbose=False, keepLabel=False, fullChrom=False, chromField=1, scoreField=4, pad=0, compact=False, doMerge=True, keepPeak=False, returnTop=0): @@ -194,6 +266,7 @@ def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False, # the case of 3 regions ABC that are in the input file as ACB as it goes now when processing C there # will be no merge with A as B is needed to bridge the two. When it comes time to process B it will # be merged with A but that will exit the loop and the merge with C will be missed. + #TODO: Are we going to require sorted region files to have this even work? for regionEntry in regionList: if regionEntry[0] == "#": if "pvalue" in regionEntry: @@ -336,7 +409,7 @@ def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False, """ if shift == "auto": - shift = getBestShiftForRegion(hitList, start, length, doWeight, maxshift) + shift = getBestShiftForRegion(hitList, start, length, useWeight=doWeight, maxShift=maxshift) seqArray, regionArray, numHits, numPlus = findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, leftPlus) @@ -346,7 +419,7 @@ def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False, smoothArray[pos] = (seqArray[pos -2] + 2 * seqArray[pos - 1] + 3 * seqArray[pos] + 2 * seqArray[pos + 1] + seqArray[pos + 2]) / 9.0 topPos = getPeakPositionList(smoothArray, length) - peak = Peak(topPos, numHits, smoothArray, numPlus, shift=shift) + peak = Peak.Peak(topPos, numHits, smoothArray, numPlus, shift=shift) if leftPlus: numLeftPlus = 0 @@ -366,36 +439,33 @@ def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False, return peak -def getBestShiftForRegion(hitList, start, length, doWeight=False, maxShift=75): +def getBestShiftForRegion(readList, start, length, useWeight=False, maxShift=75): bestShift = 0 lowestScore = 20000000000 for testShift in xrange(maxShift + 1): shiftArray = array("f", [0.] * length) - for read in hitList: + for read in readList: currentpos = read["start"] - start if read["sense"] == "+": - currentpos += testShift + senseModifier = 1 else: - currentpos -= testShift + senseModifier = -1 + currentpos += testShift * senseModifier if (currentpos < 1) or (currentpos >= length): continue - if doWeight: + if useWeight: weight = read["weight"] else: weight = 1.0 - if read["sense"] == "+": - shiftArray[currentpos] += weight - else: - shiftArray[currentpos] -= weight + shiftArray[currentpos] += weight * senseModifier currentScore = 0 for score in shiftArray: currentScore += abs(score) - print currentScore if currentScore < lowestScore: bestShift = testShift lowestScore = currentScore @@ -432,7 +502,7 @@ def findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, left hitIndex += 1 currentpos += 1 - while hitIndex < readlen and currentpos < length: + while hitIndex < readlen and currentpos < length: seqArray[currentpos] += weight hitIndex += 1 currentpos += 1 @@ -450,7 +520,7 @@ def getPeakPositionList(smoothArray, length): if topNucleotide < smoothArray[currentpos]: topNucleotide = smoothArray[currentpos] peakList = [currentpos] - elif topNucleotide == smoothArray[currentpos]: + elif topNucleotide == smoothArray[currentpos]: peakList.append(currentpos) return peakList @@ -496,40 +566,8 @@ def getFeaturesByChromDict(genomeObject, additionalRegionsDict={}, ignorePseudo= if restrictGID and gid not in restrictList: continue - featureList = featuresDict[gid] - newFeatureList = [] - isPseudo = False - for (ftype, chrom, start, stop, sense) in featureList: - if ftype == "PSEUDO": - isPseudo = True - - if (start, stop, ftype) not in newFeatureList: - notContained = True - containedList = [] - for (fstart, fstop, ftype2) in newFeatureList: - if start >= fstart and stop <= fstop: - notContained = False - - if start < fstart and stop > fstop: - containedList.append((fstart, fstop)) - - if len(containedList) > 0: - newFList = [] - notContained = True - for (fstart, fstop, ftype2) in newFeatureList: - if (fstart, fstop) not in containedList: - newFList.append((fstart, fstop, ftype2)) - if start >= fstart and stop <= fstop: - notContained = False - - newFeatureList = newFList - if notContained: - newFeatureList.append((start, stop, ftype)) - - if ignorePseudo and isPseudo: - continue - - if chrom not in featuresByChromDict: + newFeatureList, chrom, sense = getNewFeatureList(featuresDict[gid]) + if newFeatureList and chrom not in featuresByChromDict: featuresByChromDict[chrom] = [] for (start, stop, ftype) in newFeatureList: @@ -562,7 +600,40 @@ def getFeaturesByChromDict(genomeObject, additionalRegionsDict={}, ignorePseudo= return featuresByChromDict -def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True, +def getNewFeatureList(featureList, ignorePseudo=False): + newFeatureList = [] + for (featureType, chrom, start, stop, sense) in featureList: + if featureType == "PSEUDO" and ignorePseudo: + return [], chrom, sense + + if (start, stop, featureType) not in newFeatureList: + notContained = True + containedList = [] + for (newFeatureStart, newFeatureStop, featureType2) in newFeatureList: + if start >= newFeatureStart and stop <= newFeatureStop: + notContained = False + + if start < newFeatureStart and stop > newFeatureStop: + containedList.append((newFeatureStart, newFeatureStop)) + + if len(containedList) > 0: + newFList = [] + notContained = True + for (newFeatureStart, newFeatureStop, featureType2) in newFeatureList: + if (newFeatureStart, newFeatureStop) not in containedList: + newFList.append((newFeatureStart, newFeatureStop, featureType2)) + if start >= newFeatureStart and stop <= newFeatureStop: + notContained = False + + newFeatureList = newFList + + if notContained: + newFeatureList.append((start, stop, featureType)) + + return newFeatureList, chrom, sense + + +def getLocusByChromDict(genome, upstream=0, downstream=0, useCDS=True, additionalRegionsDict={}, ignorePseudo=False, upstreamSpanTSS=False, lengthCDS=0, keepSense=False, adjustToNeighbor=True): """ return a dictionary of gene loci. Can be used to retrieve additional @@ -575,46 +646,11 @@ def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True, lengthCDS < 0bp, return only the last X bp from CDS. """ locusByChromDict = {} - if upstream == 0 and downstream == 0 and not useCDS: - print "getLocusByChromDict: asked for no sequence - returning empty dict" - return locusByChromDict - elif upstream > 0 and downstream > 0 and not useCDS: - print "getLocusByChromDict: asked for only upstream and downstream - returning empty dict" - return locusByChromDict - elif lengthCDS != 0 and not useCDS: - print "getLocusByChromDict: asked for partial CDS but not useCDS - returning empty dict" - return locusByChromDict - elif upstreamSpanTSS and lengthCDS != 0: - print "getLocusByChromDict: asked for TSS spanning and partial CDS - returning empty dict" - return locusByChromDict - elif lengthCDS > 0 and downstream > 0: - print "getLocusByChromDict: asked for discontinuous partial CDS from start and downstream - returning empty dict" - return locusByChromDict - elif lengthCDS < 0 and upstream > 0: - print "getLocusByChromDict: asked for discontinuous partial CDS from stop and upstream - returning empty dict" + if usingBadLocusParameters(upstream, downstream, useCDS, lengthCDS, upstreamSpanTSS): return locusByChromDict - genome = genomeObject.genome - featuresDict = genomeObject.getallGeneFeatures() - if len(additionalRegionsDict) > 0: - sortList = [] - for chrom in additionalRegionsDict: - for region in additionalRegionsDict[chrom]: - label = region.label - if label not in sortList: - sortList.append(label) - - if label not in featuresDict: - featuresDict[label] = [] - sense = "+" - else: - sense = featuresDict[label][0][-1] - - featuresDict[label].append(("custom", chrom, region.start, region.stop, sense)) - - for gid in sortList: - featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2])) - + genomeName = genome.genome + featuresDict = getGeneFeatures(genome, additionalRegionsDict) for gid in featuresDict: featureList = featuresDict[gid] newFeatureList = [] @@ -647,34 +683,28 @@ def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True, distance = upstream if adjustToNeighbor: - nextGene = genomeObject.leftGeneDistance((genome, gid), distance * 2) + nextGene = genome.leftGeneDistance((genomeName, gid), distance * 2) if nextGene < distance * 2: distance = nextGene / 2 - if distance < 1: - distance = 1 - + distance = max(distance, 1) gstart -= distance if downstream > 0: distance = downstream if adjustToNeighbor: - nextGene = genomeObject.rightGeneDistance((genome, gid), downstream * 2) + nextGene = genome.rightGeneDistance((genomeName, gid), downstream * 2) if nextGene < downstream * 2: distance = nextGene / 2 - if distance < 1: - distance = 1 - + distance = max(distance, 1) gstop += distance - if lengthCDS > 0: - if lengthCDS < glen: - gstop = newFeatureList[0][0] + lengthCDS + if 0 < lengthCDS < glen: + gstop = newFeatureList[0][0] + lengthCDS - if lengthCDS < 0: - if abs(lengthCDS) < glen: - gstart = newFeatureList[-1][1] + lengthCDS + if lengthCDS < 0 and abs(lengthCDS) < glen: + gstart = newFeatureList[-1][1] + lengthCDS else: if not useCDS and upstream > 0: if upstreamSpanTSS: @@ -692,36 +722,29 @@ def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True, distance = upstream if adjustToNeighbor: - nextGene = genomeObject.rightGeneDistance((genome, gid), distance * 2) + nextGene = genome.rightGeneDistance((genomeName, gid), distance * 2) if nextGene < distance * 2: distance = nextGene / 2 - if distance < 1: - distance = 1 - + distance = max(distance, 1) gstop += distance if downstream > 0: distance = downstream if adjustToNeighbor: - nextGene = genomeObject.leftGeneDistance((genome, gid), downstream * 2) + nextGene = genome.leftGeneDistance((genomeName, gid), downstream * 2) if nextGene < downstream * 2: distance = nextGene / 2 - if distance < 1: - distance = 1 - + distance = max(distance, 1) gstart -= distance - if lengthCDS > 0: - if lengthCDS < glen: - gstart = newFeatureList[-1][-1] - lengthCDS + if 0 < lengthCDS < glen: + gstart = newFeatureList[-1][-1] - lengthCDS - if lengthCDS < 0: - if abs(lengthCDS) < glen: - gstop = newFeatureList[0][0] - lengthCDS + if lengthCDS < 0 and abs(lengthCDS) < glen: + gstop = newFeatureList[0][0] - lengthCDS - glen = abs(gstop - gstart) if chrom not in locusByChromDict: locusByChromDict[chrom] = [] @@ -736,6 +759,53 @@ def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True, return locusByChromDict +def usingBadLocusParameters(upstream, downstream, useCDS, lengthCDS, upstreamSpanTSS): + if upstream == 0 and downstream == 0 and not useCDS: + print "getLocusByChromDict: asked for no sequence - returning empty dict" + return True + elif upstream > 0 and downstream > 0 and not useCDS: + print "getLocusByChromDict: asked for only upstream and downstream - returning empty dict" + return True + elif lengthCDS != 0 and not useCDS: + print "getLocusByChromDict: asked for partial CDS but not useCDS - returning empty dict" + return True + elif upstreamSpanTSS and lengthCDS != 0: + print "getLocusByChromDict: asked for TSS spanning and partial CDS - returning empty dict" + return True + elif lengthCDS > 0 and downstream > 0: + print "getLocusByChromDict: asked for discontinuous partial CDS from start and downstream - returning empty dict" + return True + elif lengthCDS < 0 and upstream > 0: + print "getLocusByChromDict: asked for discontinuous partial CDS from stop and upstream - returning empty dict" + return True + + return False + + +def getGeneFeatures(genome, additionalRegionsDict): + featuresDict = genome.getallGeneFeatures() + if len(additionalRegionsDict) > 0: + sortList = [] + for chrom in additionalRegionsDict: + for region in additionalRegionsDict[chrom]: + label = region.label + if label not in sortList: + sortList.append(label) + + if label not in featuresDict: + featuresDict[label] = [] + sense = "+" + else: + sense = featuresDict[label][0][-1] + + featuresDict[label].append(("custom", chrom, region.start, region.stop, sense)) + + for gid in sortList: + featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2])) + + return featuresDict + + def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[], normalizedTag=1., defaultRegionFormat=True, fixedFirstBin=-1, binLength=-1): @@ -798,7 +868,7 @@ def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[], rlen = regionTuple[lengthField] try: rsense = regionTuple[senseField] - except: + except IndexError: rsense = "F" if tagStart > stop: @@ -815,37 +885,25 @@ def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[], else: regionBinLength = binLength - startdist = tagStart - start if rsense == "F": - # we are relying on python's integer division quirk - binID = startdist / regionBinLength - if (fixedFirstBin > 0) and (startdist < fixedFirstBin): - binID = 0 - elif fixedFirstBin > 0: - binID = 1 - - if binID >= bins: - binID = bins - 1 - - try: - regionsBins[regionID][binID] += normalizedTag * weight - except KeyError: - print "%s %s" % (regionID, str(binID)) + positionInRegion = tagStart - start else: - rdist = rlen - startdist - binID = rdist / regionBinLength - if (fixedFirstBin > 0) and (rdist < fixedFirstBin): - binID = 0 - elif fixedFirstBin > 0: - binID = 1 - - if binID >= bins: - binID = bins - 1 - - try: - regionsBins[regionID][binID] += normalizedTag * weight - except KeyError: - print "%s %s" % (regionID, str(binID)) + positionInRegion = start + rlen - tagStart + + # we are relying on python's integer division quirk + binID = positionInRegion / regionBinLength + if (fixedFirstBin > 0) and (positionInRegion < fixedFirstBin): + binID = 0 + elif fixedFirstBin > 0: + binID = 1 + + if binID >= bins: + binID = bins - 1 + + try: + regionsBins[regionID][binID] += normalizedTag * weight + except KeyError: + print "%s %s" % (regionID, str(binID)) stopPoint = stop