4 from time import strftime
5 from array import array
6 from collections import defaultdict
8 from cistematic.core.geneinfo import geneinfoDB
9 from cistematic.genomes import Genome
12 commoncodeVersion = 5.6
14 #TODO: This is a function dumping ground - needs to be reworked
15 class ErangeError(Exception):
19 def getReverseComplement(base):
30 def countDuplicatesInList(listToCheck):
31 tally = defaultdict(int)
32 for item in listToCheck:
38 def writeLog(logFile, messenger, message):
39 """ create a log file to write a message from a messenger or append to an existing file.
42 logfile = open(logFile)
44 logfile = open(logFile, "w")
46 logfile = open(logFile, "a")
48 logfile.writelines("%s: [%s] %s\n" % (strftime("%Y-%m-%d %H:%M:%S"), messenger, message))
52 def getChromInfoList(chrominfo):
54 linelist = open(chrominfo)
56 fields = line.strip().split('\t')
60 chromInfoList.append((chr,start,end))
67 # BAM file support functions
68 def isSpliceEntry(cigarTupleList):
70 if cigarTupleList is not None:
71 for operation,length in cigarTupleList:
79 def addPairIDtoReadID(alignedread):
80 ID = alignedread.qname
81 if alignedread.is_read1:
84 if alignedread.is_read2:
90 def getHeaderComment(bamHeader, commentKey):
91 for comment in bamHeader["CO"]:
92 fields = comment.split("\t")
93 if fields[0] == commentKey:
99 def getReadSense(read):
107 #TODO: is this where we need to take into account the NH > 10 restriction?
108 def countThisRead(read, countUniqs, countMulti, countSplices, sense):
111 if sense in ['-', '+'] and sense != getReadSense(read):
113 elif not countSplices and isSpliceEntry(read.cigar):
115 elif not countUniqs and read.opt('NH') == 1:
117 elif not countMulti and read.opt('NH') > 1:
123 # Cistematic functions
124 def getGeneInfoDict(genome, cache=False):
125 idb = geneinfoDB(cache=cache)
126 if genome == "dmelanogaster":
127 geneinfoDict = idb.getallGeneInfo(genome, infoKey="locus")
129 geneinfoDict = idb.getallGeneInfo(genome)
134 def getGeneAnnotDict(genome, inRAM=False):
135 return getExtendedGeneAnnotDict(genome, "", inRAM=inRAM)
138 def getExtendedGeneAnnotDict(genomeName, extendGenome, replaceModels=False, inRAM=False):
139 genome = Genome(genomeName, inRAM=inRAM)
140 if extendGenome != "":
141 genome.extendFeatures(extendGenome, replace=replaceModels)
143 geneannotDict = genome.allAnnotInfo()
148 # Configuration File Support
149 def getConfigParser(fileList=[]):
150 configFiles = ["erange.config", os.path.expanduser("~/.erange.config")]
151 for filename in fileList:
152 configFiles.append(filename)
154 config = ConfigParser.SafeConfigParser()
155 config.read(configFiles)
160 def getConfigOption(parser, section, option, default=None):
162 setting = parser.get(section, option)
163 except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
169 def getConfigIntOption(parser, section, option, default=None):
171 setting = parser.getint(section, option)
172 except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
178 def getConfigFloatOption(parser, section, option, default=None):
180 setting = parser.getfloat(section, option)
181 except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
187 def getConfigBoolOption(parser, section, option, default=None):
189 setting = parser.getboolean(section, option)
190 except (ConfigParser.NoSectionError, ConfigParser.NoOptionError, ValueError):
196 def getAllConfigSectionOptions(parser, section):
198 setting = parser.items(section)
199 except ConfigParser.NoSectionError:
205 # All the Other Stuff from before
206 def getMergedRegions(regionfilename, maxDist=1000, minHits=0, verbose=False, keepLabel=False,
207 fullChrom=False, chromField=1, scoreField=4, pad=0, compact=False,
208 doMerge=True, keepPeak=False, returnTop=0):
210 """ returns a dictionary containing a list of merged overlapping regions by chromosome;
211 can optionally filter regions that have a scoreField fewer than minHits.
212 Can also optionally return the label of each region, as well as the
213 peak, if supplied (peakPos and peakHeight should be the last 2 fields).
214 Can return the top regions based on score if higher than minHits.
216 infile = open(regionfilename)
217 lines = infile.readlines()
218 regions = getMergedRegionsFromList(lines, maxDist, minHits, verbose, keepLabel,
219 fullChrom, chromField, scoreField, pad, compact,
220 doMerge, keepPeak, returnTop)
227 def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False, keepLabel=False,
228 fullChrom = False, chromField=1, scoreField=4, pad=0, compact=False,
229 doMerge=True, keepPeak=False, returnTop=0):
230 """ returns a dictionary containing a list of merged overlapping regions by chromosome;
231 can optionally filter regions that have a scoreField fewer than minHits.
232 Can also optionally return the label of each region, as well as the
233 peak, if supplied (peakPos and peakHeight should be the last 2 fields).
234 Can return the top regions based on score if higher than minHits.
239 if 0 < returnTop < len(regionList):
241 for regionEntry in regionList:
242 if regionEntry[0] == "#":
243 if "pvalue" in regionEntry:
246 if "readShift" in regionEntry:
251 fields = regionEntry.strip().split("\t")
252 hits = float(fields[scoreField].strip())
256 returnTop = -1 * returnTop
257 minScore = scores[returnTop]
258 if minScore > minHits:
262 chromField = int(chromField)
264 #TODO: Current algorithm processes input file line by line and compares with prior lines. Problem is it
265 # exits at the first merge. This is not a problem when the input is sorted by start position, but in
266 # the case of 3 regions ABC that are in the input file as ACB as it goes now when processing C there
267 # will be no merge with A as B is needed to bridge the two. When it comes time to process B it will
268 # be merged with A but that will exit the loop and the merge with C will be missed.
269 #TODO: Are we going to require sorted region files to have this even work?
270 for regionEntry in regionList:
271 if regionEntry[0] == "#":
272 if "pvalue" in regionEntry:
275 if "readShift" in regionEntry:
280 fields = regionEntry.strip().split("\t")
283 hits = float(fields[scoreField].strip())
284 except (IndexError, ValueError):
291 (chrom, pos) = fields[chromField].split(":")
292 (front, back) = pos.split("-")
296 label = string.join(fields[:chromField],"\t")
297 chrom = fields[chromField]
298 start = int(fields[chromField + 1]) - pad
299 stop = int(fields[chromField + 2]) + pad
303 start = int(fields[2]) - pad
304 stop = int(fields[3]) + pad
310 peakPos = int(fields[-2 - hasPvalue - hasShift])
311 peakHeight = float(fields[-1 - hasPvalue - hasShift])
313 if chrom not in regions:
318 if doMerge and len(regions[chrom]) > 0:
319 for index in range(len(regions[chrom])):
320 region = regions[chrom][index]
321 rstart = region.start
323 if regionsOverlap(start, stop, rstart, rstop) or regionsAreWithinDistance(start, stop, rstart, rstop, maxDist):
331 rpeakPos = region.peakPos
332 rpeakHeight = region.peakHeight
333 if peakHeight > rpeakHeight:
334 rpeakHeight = peakHeight
337 regions[chrom][index].start = rstart
338 regions[chrom][index].stop = rstop
339 regions[chrom][index].length = abs(rstop - rstart)
341 regions[chrom][index].label = label
344 regions[chrom][index].peakPos = rpeakPos
345 regions[chrom][index].peakHeight = rpeakHeight
353 region = Region.Region(start, stop)
358 region.peakPos = peakPos
359 region.peakHeight = peakHeight
361 regions[chrom].append(region)
364 if verbose and (count % 100000 == 0):
368 for chrom in regions:
369 regionCount += len(regions[chrom])
370 regions[chrom].sort(cmp=lambda x,y:cmp(x.start, y.start))
373 print "merged %d times" % mergeCount
374 print "returning %d regions" % regionCount
379 def regionsOverlap(start, stop, rstart, rstop):
381 (start, stop) = (stop, start)
384 (rstart, rstop) = (rstop, rstart)
386 return (rstart <= start <= rstop) or (rstart <= stop <= rstop) or (start <= rstart <= stop) or (start <= rstop <= stop)
389 def regionsAreWithinDistance(start, stop, rstart, rstop, maxDist):
391 (start, stop) = (stop, start)
394 (rstart, rstop) = (rstop, rstart)
396 return (abs(rstart-stop) <= maxDist) or (abs(rstop-start) <= maxDist)
399 def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False,
400 shift=0, maxshift=75):
401 """ find the peak in a list of reads (hitlist) in a region
402 of a given length and absolute start point. returns a
403 list of peaks, the number of hits, a triangular-smoothed
404 version of hitlist, and the number of reads that are
405 forward (plus) sense.
406 If doWeight is True, weight the reads accordingly.
407 If leftPlus is True, return the number of plus reads left of
408 the peak, taken to be the first TopPos position.
412 shift = getBestShiftForRegion(hitList, start, length, useWeight=doWeight, maxShift=maxshift)
414 seqArray, regionArray, numHits, numPlus = findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, leftPlus)
416 # implementing a triangular smooth
417 smoothArray = array("f", [0.] * length)
418 for pos in range(2,length -2):
419 smoothArray[pos] = (seqArray[pos -2] + 2 * seqArray[pos - 1] + 3 * seqArray[pos] + 2 * seqArray[pos + 1] + seqArray[pos + 2]) / 9.0
421 topPos = getPeakPositionList(smoothArray, length)
422 peak = Peak.Peak(topPos, numHits, smoothArray, numPlus, shift=shift)
427 for read in regionArray:
429 weight = read["weight"]
433 currentPos = read["start"] - start
434 if currentPos <= maxPos and read["sense"] == "+":
435 numLeftPlus += weight
437 peak.numLeftPlus = numLeftPlus
442 def getBestShiftForRegion(readList, start, length, useWeight=False, maxShift=75):
444 lowestScore = 20000000000
445 for testShift in xrange(maxShift + 1):
446 shiftArray = array("f", [0.] * length)
447 for read in readList:
448 currentpos = read["start"] - start
449 if read["sense"] == "+":
454 currentpos += testShift * senseModifier
455 if (currentpos < 1) or (currentpos >= length):
459 weight = read["weight"]
463 shiftArray[currentpos] += weight * senseModifier
466 for score in shiftArray:
467 currentScore += abs(score)
469 if currentScore < lowestScore:
470 bestShift = testShift
471 lowestScore = currentScore
476 def findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, leftPlus):
477 seqArray = array("f", [0.] * length)
482 currentpos = read["start"] - start
483 if read["sense"] == "+":
488 if (currentpos < 1 - readlen) or (currentpos >= length):
492 weight = read["weight"]
498 regionArray.append(read)
501 while currentpos < 0:
505 while hitIndex < readlen and currentpos < length:
506 seqArray[currentpos] += weight
510 if read["sense"] == "+":
513 return seqArray, regionArray, numHits, numPlus
516 def getPeakPositionList(smoothArray, length):
519 for currentpos in xrange(length):
520 if topNucleotide < smoothArray[currentpos]:
521 topNucleotide = smoothArray[currentpos]
522 peakList = [currentpos]
523 elif topNucleotide == smoothArray[currentpos]:
524 peakList.append(currentpos)
529 def getFeaturesByChromDict(genomeObject, additionalRegionsDict={}, ignorePseudo=False,
530 restrictList=[], regionComplement=False, maxStop=250000000):
531 """ return a dictionary of cistematic gene features. Requires
532 cistematic, obviously. Can filter-out pseudogenes. Will use
533 additional regions dict to supplement gene models, if available.
534 Can restrict output to a list of GIDs.
535 If regionComplement is set to true, returns the regions *outside* of the
536 calculated boundaries, which is useful for retrieving intronic and
537 intergenic regions. maxStop is simply used to define the uppermost
538 boundary of the complement region.
540 featuresDict = genomeObject.getallGeneFeatures()
542 if len(restrictList) > 0:
545 if len(additionalRegionsDict) > 0:
547 for chrom in additionalRegionsDict:
548 for region in additionalRegionsDict[chrom]:
550 if label not in sortList:
551 sortList.append(label)
553 if label not in featuresDict:
554 featuresDict[label] = []
557 sense = featuresDict[label][0][-1]
559 featuresDict[label].append(("custom", chrom, region.start, region.stop, sense))
562 featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
564 featuresByChromDict = {}
565 for gid in featuresDict:
566 if restrictGID and gid not in restrictList:
569 newFeatureList, chrom, sense = getNewFeatureList(featuresDict[gid])
570 if newFeatureList and chrom not in featuresByChromDict:
571 featuresByChromDict[chrom] = []
573 for (start, stop, ftype) in newFeatureList:
574 featuresByChromDict[chrom].append((start, stop, gid, sense, ftype))
576 for chrom in featuresByChromDict:
577 featuresByChromDict[chrom].sort()
580 complementByChromDict = {}
582 for chrom in featuresByChromDict:
583 complementByChromDict[chrom] = []
584 listLength = len(featuresByChromDict[chrom])
587 for index in range(listLength):
588 currentStop = featuresByChromDict[chrom][index][0]
590 if currentStart < currentStop:
591 complementByChromDict[chrom].append((currentStart, currentStop, "nonExon%d" % complementIndex, "F", "nonExon"))
593 currentStart = featuresByChromDict[chrom][index][1]
595 currentStop = maxStop
596 complementByChromDict[chrom].append((currentStart, currentStop, "nonExon%d" % complementIndex, "F", "nonExon"))
598 return (featuresByChromDict, complementByChromDict)
600 return featuresByChromDict
603 def getNewFeatureList(featureList, ignorePseudo=False):
605 for (featureType, chrom, start, stop, sense) in featureList:
606 if featureType == "PSEUDO" and ignorePseudo:
607 return [], chrom, sense
609 if (start, stop, featureType) not in newFeatureList:
612 for (newFeatureStart, newFeatureStop, featureType2) in newFeatureList:
613 if start >= newFeatureStart and stop <= newFeatureStop:
616 if start < newFeatureStart and stop > newFeatureStop:
617 containedList.append((newFeatureStart, newFeatureStop))
619 if len(containedList) > 0:
622 for (newFeatureStart, newFeatureStop, featureType2) in newFeatureList:
623 if (newFeatureStart, newFeatureStop) not in containedList:
624 newFList.append((newFeatureStart, newFeatureStop, featureType2))
625 if start >= newFeatureStart and stop <= newFeatureStop:
628 newFeatureList = newFList
631 newFeatureList.append((start, stop, featureType))
633 return newFeatureList, chrom, sense
636 def getLocusByChromDict(genome, upstream=0, downstream=0, useCDS=True,
637 additionalRegionsDict={}, ignorePseudo=False, upstreamSpanTSS=False,
638 lengthCDS=0, keepSense=False, adjustToNeighbor=True):
639 """ return a dictionary of gene loci. Can be used to retrieve additional
640 sequence upstream or downstream of gene, up to the next gene. Requires
641 cistematic, obviously.
642 Can filter-out pseudogenes and use additional regions outside of existing
643 gene models. Use upstreamSpanTSS to overlap half of the upstream region
645 If lengthCDS > 0 bp, e.g. X, return only the starting X bp from CDS. If
646 lengthCDS < 0bp, return only the last X bp from CDS.
648 locusByChromDict = {}
649 if usingBadLocusParameters(upstream, downstream, useCDS, lengthCDS, upstreamSpanTSS):
650 return locusByChromDict
652 genomeName = genome.genome
653 featuresDict = getGeneFeatures(genome, additionalRegionsDict)
654 for gid in featuresDict:
655 featureList = featuresDict[gid]
657 for (ftype, chrom, start, stop, sense) in featureList:
658 newFeatureList.append((start, stop))
660 if ignorePseudo and ftype == "PSEUDO":
663 newFeatureList.sort()
665 sense = featureList[0][-1]
666 gstart = newFeatureList[0][0]
667 gstop = newFeatureList[-1][1]
668 glen = abs(gstart - gstop)
670 if not useCDS and upstream > 0:
672 if gstop > (gstart + upstream / 2):
673 gstop = gstart + upstream / 2
676 elif not useCDS and downstream > 0:
681 distance = upstream / 2
686 nextGene = genome.leftGeneDistance((genomeName, gid), distance * 2)
687 if nextGene < distance * 2:
688 distance = nextGene / 2
690 distance = max(distance, 1)
694 distance = downstream
696 nextGene = genome.rightGeneDistance((genomeName, gid), downstream * 2)
697 if nextGene < downstream * 2:
698 distance = nextGene / 2
700 distance = max(distance, 1)
703 if 0 < lengthCDS < glen:
704 gstop = newFeatureList[0][0] + lengthCDS
706 if lengthCDS < 0 and abs(lengthCDS) < glen:
707 gstart = newFeatureList[-1][1] + lengthCDS
709 if not useCDS and upstream > 0:
711 if gstart < (gstop - upstream / 2):
712 gstart = gstop - upstream / 2
715 elif not useCDS and downstream > 0:
720 distance = upstream /2
725 nextGene = genome.rightGeneDistance((genomeName, gid), distance * 2)
726 if nextGene < distance * 2:
727 distance = nextGene / 2
729 distance = max(distance, 1)
733 distance = downstream
735 nextGene = genome.leftGeneDistance((genomeName, gid), downstream * 2)
736 if nextGene < downstream * 2:
737 distance = nextGene / 2
739 distance = max(distance, 1)
742 if 0 < lengthCDS < glen:
743 gstart = newFeatureList[-1][-1] - lengthCDS
745 if lengthCDS < 0 and abs(lengthCDS) < glen:
746 gstop = newFeatureList[0][0] - lengthCDS
748 if chrom not in locusByChromDict:
749 locusByChromDict[chrom] = []
752 locusByChromDict[chrom].append((gstart, gstop, gid, glen, sense))
754 locusByChromDict[chrom].append((gstart, gstop, gid, glen))
756 for chrom in locusByChromDict:
757 locusByChromDict[chrom].sort()
759 return locusByChromDict
762 def usingBadLocusParameters(upstream, downstream, useCDS, lengthCDS, upstreamSpanTSS):
763 if upstream == 0 and downstream == 0 and not useCDS:
764 print "getLocusByChromDict: asked for no sequence - returning empty dict"
766 elif upstream > 0 and downstream > 0 and not useCDS:
767 print "getLocusByChromDict: asked for only upstream and downstream - returning empty dict"
769 elif lengthCDS != 0 and not useCDS:
770 print "getLocusByChromDict: asked for partial CDS but not useCDS - returning empty dict"
772 elif upstreamSpanTSS and lengthCDS != 0:
773 print "getLocusByChromDict: asked for TSS spanning and partial CDS - returning empty dict"
775 elif lengthCDS > 0 and downstream > 0:
776 print "getLocusByChromDict: asked for discontinuous partial CDS from start and downstream - returning empty dict"
778 elif lengthCDS < 0 and upstream > 0:
779 print "getLocusByChromDict: asked for discontinuous partial CDS from stop and upstream - returning empty dict"
785 def getGeneFeatures(genome, additionalRegionsDict):
786 featuresDict = genome.getallGeneFeatures()
787 if len(additionalRegionsDict) > 0:
789 for chrom in additionalRegionsDict:
790 for region in additionalRegionsDict[chrom]:
792 if label not in sortList:
793 sortList.append(label)
795 if label not in featuresDict:
796 featuresDict[label] = []
799 sense = featuresDict[label][0][-1]
801 featuresDict[label].append(("custom", chrom, region.start, region.stop, sense))
804 featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
809 def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[],
810 normalizedTag=1., defaultRegionFormat=True, fixedFirstBin=-1,
812 """ returns 2 dictionaries of bin counts and region lengths, given a dictionary of predefined regions,
813 a dictionary of reads, a number of bins, the length of reads, and optionally a list of regions
814 or a different weight / tag.
820 if defaultRegionFormat:
833 print "entering computeRegionBins"
834 if len(regionList) > 0:
835 for readID in regionList:
836 regionsBins[readID] = [0.] * bins
838 for chrom in regionsByChromDict:
839 for regionTuple in regionsByChromDict[chrom]:
840 regionID = regionTuple[regionIDField]
841 regionsBins[regionID] = [0.] * bins
843 for chrom in hitDict:
844 if chrom not in regionsByChromDict:
847 for regionTuple in regionsByChromDict[chrom]:
848 regionID = regionTuple[regionIDField]
849 regionsLen[regionID] = regionTuple[lengthField]
853 for read in hitDict[chrom]:
854 tagStart = read["start"]
855 weight = read["weight"]
857 if index % 100000 == 0:
858 print "read %d " % index,
860 stopPoint = tagStart + readlen
864 for regionTuple in regionsByChromDict[chrom][startRegion:]:
865 start = regionTuple[startField]
866 stop = regionTuple[stopField]
867 regionID = regionTuple[regionIDField]
868 rlen = regionTuple[lengthField]
870 rsense = regionTuple[senseField]
878 if start > stopPoint:
882 if start <= tagStart <= stop:
884 regionBinLength = rlen / bins
886 regionBinLength = binLength
889 positionInRegion = tagStart - start
891 positionInRegion = start + rlen - tagStart
893 # we are relying on python's integer division quirk
894 binID = positionInRegion / regionBinLength
895 if (fixedFirstBin > 0) and (positionInRegion < fixedFirstBin):
897 elif fixedFirstBin > 0:
904 regionsBins[regionID][binID] += normalizedTag * weight
906 print "%s %s" % (regionID, str(binID))
910 return (regionsBins, regionsLen)