X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=commoncode.py;h=0e4fdbbb7229b5f0c7689218dd9f1cbf2ba40b55;hp=f526daacca0dc91245e71205363216bc17eba2e9;hb=HEAD;hpb=77dccd7c98d8cdb60caaf178b1123df71ea662c9 diff --git a/commoncode.py b/commoncode.py index f526daa..0e4fdbb 100755 --- a/commoncode.py +++ b/commoncode.py @@ -1,8 +1,3 @@ -# -# commoncode.py -# ENRAGE -# - import ConfigParser import os import string @@ -15,8 +10,8 @@ from cistematic.genomes import Genome import Region commoncodeVersion = 5.6 -currentRDSversion = 2.0 +#TODO: This is a function dumping ground - needs to be reworked class ErangeError(Exception): pass @@ -54,6 +49,78 @@ def writeLog(logFile, messenger, message): logfile.close() +def getChromInfoList(chrominfo): + chromInfoList = [] + linelist = open(chrominfo) + for line in linelist: + fields = line.strip().split('\t') + chr = fields[0] + start = 0 + end = int(fields[1]) + chromInfoList.append((chr,start,end)) + + linelist.close() + + return chromInfoList + + +# BAM file support functions +def isSpliceEntry(cigarTupleList): + isSplice = False + if cigarTupleList is not None: + for operation,length in cigarTupleList: + if operation == 3: + isSplice = True + break + + return isSplice + + +def addPairIDtoReadID(alignedread): + ID = alignedread.qname + if alignedread.is_read1: + ID = ID + '/1' + + if alignedread.is_read2: + ID = ID + '/2' + + return ID + + +def getHeaderComment(bamHeader, commentKey): + for comment in bamHeader["CO"]: + fields = comment.split("\t") + if fields[0] == commentKey: + return fields[1] + + raise KeyError + + +def getReadSense(read): + if read.is_reverse: + sense = "-" + else: + sense = "+" + + return sense + +#TODO: is this where we need to take into account the NH > 10 restriction? +def countThisRead(read, countUniqs, countMulti, countSplices, sense): + countRead = True + + if sense in ['-', '+'] and sense != getReadSense(read): + countRead = False + elif not countSplices and isSpliceEntry(read.cigar): + countRead = False + elif not countUniqs and read.opt('NH') == 1: + countRead = False + elif not countMulti and read.opt('NH') > 1: + countRead = False + + return countRead + + +# Cistematic functions def getGeneInfoDict(genome, cache=False): idb = geneinfoDB(cache=cache) if genome == "dmelanogaster": @@ -78,6 +145,7 @@ def getExtendedGeneAnnotDict(genomeName, extendGenome, replaceModels=False, inRA return geneannotDict +# Configuration File Support def getConfigParser(fileList=[]): configFiles = ["erange.config", os.path.expanduser("~/.erange.config")] for filename in fileList: @@ -134,6 +202,7 @@ def getAllConfigSectionOptions(parser, section): return setting +# All the Other Stuff from before def getMergedRegions(regionfilename, maxDist=1000, minHits=0, verbose=False, keepLabel=False, fullChrom=False, chromField=1, scoreField=4, pad=0, compact=False, doMerge=True, keepPeak=False, returnTop=0): @@ -192,6 +261,12 @@ def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False, mergeCount = 0 chromField = int(chromField) count = 0 + #TODO: Current algorithm processes input file line by line and compares with prior lines. Problem is it + # exits at the first merge. This is not a problem when the input is sorted by start position, but in + # the case of 3 regions ABC that are in the input file as ACB as it goes now when processing C there + # will be no merge with A as B is needed to bridge the two. When it comes time to process B it will + # be merged with A but that will exit the loop and the merge with C will be missed. + #TODO: Are we going to require sorted region files to have this even work? for regionEntry in regionList: if regionEntry[0] == "#": if "pvalue" in regionEntry: @@ -372,10 +447,11 @@ def getBestShiftForRegion(readList, start, length, useWeight=False, maxShift=75) for read in readList: currentpos = read["start"] - start if read["sense"] == "+": - currentpos += testShift + senseModifier = 1 else: - currentpos -= testShift + senseModifier = -1 + currentpos += testShift * senseModifier if (currentpos < 1) or (currentpos >= length): continue @@ -384,16 +460,12 @@ def getBestShiftForRegion(readList, start, length, useWeight=False, maxShift=75) else: weight = 1.0 - if read["sense"] == "+": - shiftArray[currentpos] += weight - else: - shiftArray[currentpos] -= weight + shiftArray[currentpos] += weight * senseModifier currentScore = 0 for score in shiftArray: currentScore += abs(score) - print currentScore if currentScore < lowestScore: bestShift = testShift lowestScore = currentScore @@ -494,40 +566,8 @@ def getFeaturesByChromDict(genomeObject, additionalRegionsDict={}, ignorePseudo= if restrictGID and gid not in restrictList: continue - featureList = featuresDict[gid] - newFeatureList = [] - isPseudo = False - for (ftype, chrom, start, stop, sense) in featureList: - if ftype == "PSEUDO": - isPseudo = True - - if (start, stop, ftype) not in newFeatureList: - notContained = True - containedList = [] - for (fstart, fstop, ftype2) in newFeatureList: - if start >= fstart and stop <= fstop: - notContained = False - - if start < fstart and stop > fstop: - containedList.append((fstart, fstop)) - - if len(containedList) > 0: - newFList = [] - notContained = True - for (fstart, fstop, ftype2) in newFeatureList: - if (fstart, fstop) not in containedList: - newFList.append((fstart, fstop, ftype2)) - if start >= fstart and stop <= fstop: - notContained = False - - newFeatureList = newFList - if notContained: - newFeatureList.append((start, stop, ftype)) - - if ignorePseudo and isPseudo: - continue - - if chrom not in featuresByChromDict: + newFeatureList, chrom, sense = getNewFeatureList(featuresDict[gid]) + if newFeatureList and chrom not in featuresByChromDict: featuresByChromDict[chrom] = [] for (start, stop, ftype) in newFeatureList: @@ -560,6 +600,39 @@ def getFeaturesByChromDict(genomeObject, additionalRegionsDict={}, ignorePseudo= return featuresByChromDict +def getNewFeatureList(featureList, ignorePseudo=False): + newFeatureList = [] + for (featureType, chrom, start, stop, sense) in featureList: + if featureType == "PSEUDO" and ignorePseudo: + return [], chrom, sense + + if (start, stop, featureType) not in newFeatureList: + notContained = True + containedList = [] + for (newFeatureStart, newFeatureStop, featureType2) in newFeatureList: + if start >= newFeatureStart and stop <= newFeatureStop: + notContained = False + + if start < newFeatureStart and stop > newFeatureStop: + containedList.append((newFeatureStart, newFeatureStop)) + + if len(containedList) > 0: + newFList = [] + notContained = True + for (newFeatureStart, newFeatureStop, featureType2) in newFeatureList: + if (newFeatureStart, newFeatureStop) not in containedList: + newFList.append((newFeatureStart, newFeatureStop, featureType2)) + if start >= newFeatureStart and stop <= newFeatureStop: + notContained = False + + newFeatureList = newFList + + if notContained: + newFeatureList.append((start, stop, featureType)) + + return newFeatureList, chrom, sense + + def getLocusByChromDict(genome, upstream=0, downstream=0, useCDS=True, additionalRegionsDict={}, ignorePseudo=False, upstreamSpanTSS=False, lengthCDS=0, keepSense=False, adjustToNeighbor=True): @@ -573,23 +646,7 @@ def getLocusByChromDict(genome, upstream=0, downstream=0, useCDS=True, lengthCDS < 0bp, return only the last X bp from CDS. """ locusByChromDict = {} - if upstream == 0 and downstream == 0 and not useCDS: - print "getLocusByChromDict: asked for no sequence - returning empty dict" - return locusByChromDict - elif upstream > 0 and downstream > 0 and not useCDS: - print "getLocusByChromDict: asked for only upstream and downstream - returning empty dict" - return locusByChromDict - elif lengthCDS != 0 and not useCDS: - print "getLocusByChromDict: asked for partial CDS but not useCDS - returning empty dict" - return locusByChromDict - elif upstreamSpanTSS and lengthCDS != 0: - print "getLocusByChromDict: asked for TSS spanning and partial CDS - returning empty dict" - return locusByChromDict - elif lengthCDS > 0 and downstream > 0: - print "getLocusByChromDict: asked for discontinuous partial CDS from start and downstream - returning empty dict" - return locusByChromDict - elif lengthCDS < 0 and upstream > 0: - print "getLocusByChromDict: asked for discontinuous partial CDS from stop and upstream - returning empty dict" + if usingBadLocusParameters(upstream, downstream, useCDS, lengthCDS, upstreamSpanTSS): return locusByChromDict genomeName = genome.genome @@ -702,6 +759,29 @@ def getLocusByChromDict(genome, upstream=0, downstream=0, useCDS=True, return locusByChromDict +def usingBadLocusParameters(upstream, downstream, useCDS, lengthCDS, upstreamSpanTSS): + if upstream == 0 and downstream == 0 and not useCDS: + print "getLocusByChromDict: asked for no sequence - returning empty dict" + return True + elif upstream > 0 and downstream > 0 and not useCDS: + print "getLocusByChromDict: asked for only upstream and downstream - returning empty dict" + return True + elif lengthCDS != 0 and not useCDS: + print "getLocusByChromDict: asked for partial CDS but not useCDS - returning empty dict" + return True + elif upstreamSpanTSS and lengthCDS != 0: + print "getLocusByChromDict: asked for TSS spanning and partial CDS - returning empty dict" + return True + elif lengthCDS > 0 and downstream > 0: + print "getLocusByChromDict: asked for discontinuous partial CDS from start and downstream - returning empty dict" + return True + elif lengthCDS < 0 and upstream > 0: + print "getLocusByChromDict: asked for discontinuous partial CDS from stop and upstream - returning empty dict" + return True + + return False + + def getGeneFeatures(genome, additionalRegionsDict): featuresDict = genome.getallGeneFeatures() if len(additionalRegionsDict) > 0: @@ -805,37 +885,25 @@ def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[], else: regionBinLength = binLength - startdist = tagStart - start if rsense == "F": - # we are relying on python's integer division quirk - binID = startdist / regionBinLength - if (fixedFirstBin > 0) and (startdist < fixedFirstBin): - binID = 0 - elif fixedFirstBin > 0: - binID = 1 - - if binID >= bins: - binID = bins - 1 - - try: - regionsBins[regionID][binID] += normalizedTag * weight - except KeyError: - print "%s %s" % (regionID, str(binID)) + positionInRegion = tagStart - start else: - rdist = rlen - startdist - binID = rdist / regionBinLength - if (fixedFirstBin > 0) and (rdist < fixedFirstBin): - binID = 0 - elif fixedFirstBin > 0: - binID = 1 - - if binID >= bins: - binID = bins - 1 - - try: - regionsBins[regionID][binID] += normalizedTag * weight - except KeyError: - print "%s %s" % (regionID, str(binID)) + positionInRegion = start + rlen - tagStart + + # we are relying on python's integer division quirk + binID = positionInRegion / regionBinLength + if (fixedFirstBin > 0) and (positionInRegion < fixedFirstBin): + binID = 0 + elif fixedFirstBin > 0: + binID = 1 + + if binID >= bins: + binID = bins - 1 + + try: + regionsBins[regionID][binID] += normalizedTag * weight + except KeyError: + print "%s %s" % (regionID, str(binID)) stopPoint = stop