X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=commoncode.py;fp=commoncode.py;h=0e4fdbbb7229b5f0c7689218dd9f1cbf2ba40b55;hp=cd449620f0fc88304f4e8adceee0e96e37e1295d;hb=4ad5495359e4322da39868020a7398676261679e;hpb=cfc5602b26323ad2365295145e3f6c622d912eb4 diff --git a/commoncode.py b/commoncode.py index cd44962..0e4fdbb 100755 --- a/commoncode.py +++ b/commoncode.py @@ -10,8 +10,8 @@ from cistematic.genomes import Genome import Region commoncodeVersion = 5.6 -currentRDSversion = 2.0 +#TODO: This is a function dumping ground - needs to be reworked class ErangeError(Exception): pass @@ -49,6 +49,78 @@ def writeLog(logFile, messenger, message): logfile.close() +def getChromInfoList(chrominfo): + chromInfoList = [] + linelist = open(chrominfo) + for line in linelist: + fields = line.strip().split('\t') + chr = fields[0] + start = 0 + end = int(fields[1]) + chromInfoList.append((chr,start,end)) + + linelist.close() + + return chromInfoList + + +# BAM file support functions +def isSpliceEntry(cigarTupleList): + isSplice = False + if cigarTupleList is not None: + for operation,length in cigarTupleList: + if operation == 3: + isSplice = True + break + + return isSplice + + +def addPairIDtoReadID(alignedread): + ID = alignedread.qname + if alignedread.is_read1: + ID = ID + '/1' + + if alignedread.is_read2: + ID = ID + '/2' + + return ID + + +def getHeaderComment(bamHeader, commentKey): + for comment in bamHeader["CO"]: + fields = comment.split("\t") + if fields[0] == commentKey: + return fields[1] + + raise KeyError + + +def getReadSense(read): + if read.is_reverse: + sense = "-" + else: + sense = "+" + + return sense + +#TODO: is this where we need to take into account the NH > 10 restriction? +def countThisRead(read, countUniqs, countMulti, countSplices, sense): + countRead = True + + if sense in ['-', '+'] and sense != getReadSense(read): + countRead = False + elif not countSplices and isSpliceEntry(read.cigar): + countRead = False + elif not countUniqs and read.opt('NH') == 1: + countRead = False + elif not countMulti and read.opt('NH') > 1: + countRead = False + + return countRead + + +# Cistematic functions def getGeneInfoDict(genome, cache=False): idb = geneinfoDB(cache=cache) if genome == "dmelanogaster": @@ -73,6 +145,7 @@ def getExtendedGeneAnnotDict(genomeName, extendGenome, replaceModels=False, inRA return geneannotDict +# Configuration File Support def getConfigParser(fileList=[]): configFiles = ["erange.config", os.path.expanduser("~/.erange.config")] for filename in fileList: @@ -129,6 +202,7 @@ def getAllConfigSectionOptions(parser, section): return setting +# All the Other Stuff from before def getMergedRegions(regionfilename, maxDist=1000, minHits=0, verbose=False, keepLabel=False, fullChrom=False, chromField=1, scoreField=4, pad=0, compact=False, doMerge=True, keepPeak=False, returnTop=0): @@ -192,7 +266,7 @@ def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False, # the case of 3 regions ABC that are in the input file as ACB as it goes now when processing C there # will be no merge with A as B is needed to bridge the two. When it comes time to process B it will # be merged with A but that will exit the loop and the merge with C will be missed. - #TODO: QForAli - Are we going to require sorted region files to have this even work? + #TODO: Are we going to require sorted region files to have this even work? for regionEntry in regionList: if regionEntry[0] == "#": if "pvalue" in regionEntry: @@ -341,10 +415,6 @@ def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False, # implementing a triangular smooth smoothArray = array("f", [0.] * length) - #TODO: QForAli - really? We're going to insert MANUFACTURED data (0.0's) and use them in the calculation - # (probably) just to avoid IndexError exceptions. Then we are going to completely ignore adjusting the - # last 2 elements of the array for (probably) the same issue. Is this how erange is dealing with the - # edge cases? for pos in range(2,length -2): smoothArray[pos] = (seqArray[pos -2] + 2 * seqArray[pos - 1] + 3 * seqArray[pos] + 2 * seqArray[pos + 1] + seqArray[pos + 2]) / 9.0