4 from time import strftime
5 from array import array
6 from collections import defaultdict
8 from cistematic.core.geneinfo import geneinfoDB
9 from cistematic.genomes import Genome
12 commoncodeVersion = 5.6
13 currentRDSversion = 2.0
15 class ErangeError(Exception):
19 def getReverseComplement(base):
30 def countDuplicatesInList(listToCheck):
31 tally = defaultdict(int)
32 for item in listToCheck:
38 def writeLog(logFile, messenger, message):
39 """ create a log file to write a message from a messenger or append to an existing file.
42 logfile = open(logFile)
44 logfile = open(logFile, "w")
46 logfile = open(logFile, "a")
48 logfile.writelines("%s: [%s] %s\n" % (strftime("%Y-%m-%d %H:%M:%S"), messenger, message))
52 def getGeneInfoDict(genome, cache=False):
53 idb = geneinfoDB(cache=cache)
54 if genome == "dmelanogaster":
55 geneinfoDict = idb.getallGeneInfo(genome, infoKey="locus")
57 geneinfoDict = idb.getallGeneInfo(genome)
62 def getGeneAnnotDict(genome, inRAM=False):
63 return getExtendedGeneAnnotDict(genome, "", inRAM=inRAM)
66 def getExtendedGeneAnnotDict(genomeName, extendGenome, replaceModels=False, inRAM=False):
67 genome = Genome(genomeName, inRAM=inRAM)
68 if extendGenome != "":
69 genome.extendFeatures(extendGenome, replace=replaceModels)
71 geneannotDict = genome.allAnnotInfo()
76 def getConfigParser(fileList=[]):
77 configFiles = ["erange.config", os.path.expanduser("~/.erange.config")]
78 for filename in fileList:
79 configFiles.append(filename)
81 config = ConfigParser.SafeConfigParser()
82 config.read(configFiles)
87 def getConfigOption(parser, section, option, default=None):
89 setting = parser.get(section, option)
90 except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
96 def getConfigIntOption(parser, section, option, default=None):
98 setting = parser.getint(section, option)
99 except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
105 def getConfigFloatOption(parser, section, option, default=None):
107 setting = parser.getfloat(section, option)
108 except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
114 def getConfigBoolOption(parser, section, option, default=None):
116 setting = parser.getboolean(section, option)
117 except (ConfigParser.NoSectionError, ConfigParser.NoOptionError, ValueError):
123 def getAllConfigSectionOptions(parser, section):
125 setting = parser.items(section)
126 except ConfigParser.NoSectionError:
132 def getMergedRegions(regionfilename, maxDist=1000, minHits=0, verbose=False, keepLabel=False,
133 fullChrom=False, chromField=1, scoreField=4, pad=0, compact=False,
134 doMerge=True, keepPeak=False, returnTop=0):
136 """ returns a dictionary containing a list of merged overlapping regions by chromosome;
137 can optionally filter regions that have a scoreField fewer than minHits.
138 Can also optionally return the label of each region, as well as the
139 peak, if supplied (peakPos and peakHeight should be the last 2 fields).
140 Can return the top regions based on score if higher than minHits.
142 infile = open(regionfilename)
143 lines = infile.readlines()
144 regions = getMergedRegionsFromList(lines, maxDist, minHits, verbose, keepLabel,
145 fullChrom, chromField, scoreField, pad, compact,
146 doMerge, keepPeak, returnTop)
153 def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False, keepLabel=False,
154 fullChrom = False, chromField=1, scoreField=4, pad=0, compact=False,
155 doMerge=True, keepPeak=False, returnTop=0):
156 """ returns a dictionary containing a list of merged overlapping regions by chromosome;
157 can optionally filter regions that have a scoreField fewer than minHits.
158 Can also optionally return the label of each region, as well as the
159 peak, if supplied (peakPos and peakHeight should be the last 2 fields).
160 Can return the top regions based on score if higher than minHits.
165 if 0 < returnTop < len(regionList):
167 for regionEntry in regionList:
168 if regionEntry[0] == "#":
169 if "pvalue" in regionEntry:
172 if "readShift" in regionEntry:
177 fields = regionEntry.strip().split("\t")
178 hits = float(fields[scoreField].strip())
182 returnTop = -1 * returnTop
183 minScore = scores[returnTop]
184 if minScore > minHits:
188 chromField = int(chromField)
190 #TODO: Current algorithm processes input file line by line and compares with prior lines. Problem is it
191 # exits at the first merge. This is not a problem when the input is sorted by start position, but in
192 # the case of 3 regions ABC that are in the input file as ACB as it goes now when processing C there
193 # will be no merge with A as B is needed to bridge the two. When it comes time to process B it will
194 # be merged with A but that will exit the loop and the merge with C will be missed.
195 #TODO: QForAli - Are we going to require sorted region files to have this even work?
196 for regionEntry in regionList:
197 if regionEntry[0] == "#":
198 if "pvalue" in regionEntry:
201 if "readShift" in regionEntry:
206 fields = regionEntry.strip().split("\t")
209 hits = float(fields[scoreField].strip())
210 except (IndexError, ValueError):
217 (chrom, pos) = fields[chromField].split(":")
218 (front, back) = pos.split("-")
222 label = string.join(fields[:chromField],"\t")
223 chrom = fields[chromField]
224 start = int(fields[chromField + 1]) - pad
225 stop = int(fields[chromField + 2]) + pad
229 start = int(fields[2]) - pad
230 stop = int(fields[3]) + pad
236 peakPos = int(fields[-2 - hasPvalue - hasShift])
237 peakHeight = float(fields[-1 - hasPvalue - hasShift])
239 if chrom not in regions:
244 if doMerge and len(regions[chrom]) > 0:
245 for index in range(len(regions[chrom])):
246 region = regions[chrom][index]
247 rstart = region.start
249 if regionsOverlap(start, stop, rstart, rstop) or regionsAreWithinDistance(start, stop, rstart, rstop, maxDist):
257 rpeakPos = region.peakPos
258 rpeakHeight = region.peakHeight
259 if peakHeight > rpeakHeight:
260 rpeakHeight = peakHeight
263 regions[chrom][index].start = rstart
264 regions[chrom][index].stop = rstop
265 regions[chrom][index].length = abs(rstop - rstart)
267 regions[chrom][index].label = label
270 regions[chrom][index].peakPos = rpeakPos
271 regions[chrom][index].peakHeight = rpeakHeight
279 region = Region.Region(start, stop)
284 region.peakPos = peakPos
285 region.peakHeight = peakHeight
287 regions[chrom].append(region)
290 if verbose and (count % 100000 == 0):
294 for chrom in regions:
295 regionCount += len(regions[chrom])
296 regions[chrom].sort(cmp=lambda x,y:cmp(x.start, y.start))
299 print "merged %d times" % mergeCount
300 print "returning %d regions" % regionCount
305 def regionsOverlap(start, stop, rstart, rstop):
307 (start, stop) = (stop, start)
310 (rstart, rstop) = (rstop, rstart)
312 return (rstart <= start <= rstop) or (rstart <= stop <= rstop) or (start <= rstart <= stop) or (start <= rstop <= stop)
315 def regionsAreWithinDistance(start, stop, rstart, rstop, maxDist):
317 (start, stop) = (stop, start)
320 (rstart, rstop) = (rstop, rstart)
322 return (abs(rstart-stop) <= maxDist) or (abs(rstop-start) <= maxDist)
325 def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False,
326 shift=0, maxshift=75):
327 """ find the peak in a list of reads (hitlist) in a region
328 of a given length and absolute start point. returns a
329 list of peaks, the number of hits, a triangular-smoothed
330 version of hitlist, and the number of reads that are
331 forward (plus) sense.
332 If doWeight is True, weight the reads accordingly.
333 If leftPlus is True, return the number of plus reads left of
334 the peak, taken to be the first TopPos position.
338 shift = getBestShiftForRegion(hitList, start, length, useWeight=doWeight, maxShift=maxshift)
340 seqArray, regionArray, numHits, numPlus = findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, leftPlus)
342 # implementing a triangular smooth
343 smoothArray = array("f", [0.] * length)
344 #TODO: QForAli - really? We're going to insert MANUFACTURED data (0.0's) and use them in the calculation
345 # (probably) just to avoid IndexError exceptions. Then we are going to completely ignore adjusting the
346 # last 2 elements of the array for (probably) the same issue. Is this how erange is dealing with the
348 for pos in range(2,length -2):
349 smoothArray[pos] = (seqArray[pos -2] + 2 * seqArray[pos - 1] + 3 * seqArray[pos] + 2 * seqArray[pos + 1] + seqArray[pos + 2]) / 9.0
351 topPos = getPeakPositionList(smoothArray, length)
352 peak = Peak.Peak(topPos, numHits, smoothArray, numPlus, shift=shift)
357 for read in regionArray:
359 weight = read["weight"]
363 currentPos = read["start"] - start
364 if currentPos <= maxPos and read["sense"] == "+":
365 numLeftPlus += weight
367 peak.numLeftPlus = numLeftPlus
372 def getBestShiftForRegion(readList, start, length, useWeight=False, maxShift=75):
374 lowestScore = 20000000000
375 for testShift in xrange(maxShift + 1):
376 shiftArray = array("f", [0.] * length)
377 for read in readList:
378 currentpos = read["start"] - start
379 if read["sense"] == "+":
384 currentpos += testShift * senseModifier
385 if (currentpos < 1) or (currentpos >= length):
389 weight = read["weight"]
393 shiftArray[currentpos] += weight * senseModifier
396 for score in shiftArray:
397 currentScore += abs(score)
399 if currentScore < lowestScore:
400 bestShift = testShift
401 lowestScore = currentScore
406 def findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, leftPlus):
407 seqArray = array("f", [0.] * length)
412 currentpos = read["start"] - start
413 if read["sense"] == "+":
418 if (currentpos < 1 - readlen) or (currentpos >= length):
422 weight = read["weight"]
428 regionArray.append(read)
431 while currentpos < 0:
435 while hitIndex < readlen and currentpos < length:
436 seqArray[currentpos] += weight
440 if read["sense"] == "+":
443 return seqArray, regionArray, numHits, numPlus
446 def getPeakPositionList(smoothArray, length):
449 for currentpos in xrange(length):
450 if topNucleotide < smoothArray[currentpos]:
451 topNucleotide = smoothArray[currentpos]
452 peakList = [currentpos]
453 elif topNucleotide == smoothArray[currentpos]:
454 peakList.append(currentpos)
459 def getFeaturesByChromDict(genomeObject, additionalRegionsDict={}, ignorePseudo=False,
460 restrictList=[], regionComplement=False, maxStop=250000000):
461 """ return a dictionary of cistematic gene features. Requires
462 cistematic, obviously. Can filter-out pseudogenes. Will use
463 additional regions dict to supplement gene models, if available.
464 Can restrict output to a list of GIDs.
465 If regionComplement is set to true, returns the regions *outside* of the
466 calculated boundaries, which is useful for retrieving intronic and
467 intergenic regions. maxStop is simply used to define the uppermost
468 boundary of the complement region.
470 featuresDict = genomeObject.getallGeneFeatures()
472 if len(restrictList) > 0:
475 if len(additionalRegionsDict) > 0:
477 for chrom in additionalRegionsDict:
478 for region in additionalRegionsDict[chrom]:
480 if label not in sortList:
481 sortList.append(label)
483 if label not in featuresDict:
484 featuresDict[label] = []
487 sense = featuresDict[label][0][-1]
489 featuresDict[label].append(("custom", chrom, region.start, region.stop, sense))
492 featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
494 featuresByChromDict = {}
495 for gid in featuresDict:
496 if restrictGID and gid not in restrictList:
499 newFeatureList, chrom, sense = getNewFeatureList(featuresDict[gid])
500 if newFeatureList and chrom not in featuresByChromDict:
501 featuresByChromDict[chrom] = []
503 for (start, stop, ftype) in newFeatureList:
504 featuresByChromDict[chrom].append((start, stop, gid, sense, ftype))
506 for chrom in featuresByChromDict:
507 featuresByChromDict[chrom].sort()
510 complementByChromDict = {}
512 for chrom in featuresByChromDict:
513 complementByChromDict[chrom] = []
514 listLength = len(featuresByChromDict[chrom])
517 for index in range(listLength):
518 currentStop = featuresByChromDict[chrom][index][0]
520 if currentStart < currentStop:
521 complementByChromDict[chrom].append((currentStart, currentStop, "nonExon%d" % complementIndex, "F", "nonExon"))
523 currentStart = featuresByChromDict[chrom][index][1]
525 currentStop = maxStop
526 complementByChromDict[chrom].append((currentStart, currentStop, "nonExon%d" % complementIndex, "F", "nonExon"))
528 return (featuresByChromDict, complementByChromDict)
530 return featuresByChromDict
533 def getNewFeatureList(featureList, ignorePseudo=False):
535 for (featureType, chrom, start, stop, sense) in featureList:
536 if featureType == "PSEUDO" and ignorePseudo:
537 return [], chrom, sense
539 if (start, stop, featureType) not in newFeatureList:
542 for (newFeatureStart, newFeatureStop, featureType2) in newFeatureList:
543 if start >= newFeatureStart and stop <= newFeatureStop:
546 if start < newFeatureStart and stop > newFeatureStop:
547 containedList.append((newFeatureStart, newFeatureStop))
549 if len(containedList) > 0:
552 for (newFeatureStart, newFeatureStop, featureType2) in newFeatureList:
553 if (newFeatureStart, newFeatureStop) not in containedList:
554 newFList.append((newFeatureStart, newFeatureStop, featureType2))
555 if start >= newFeatureStart and stop <= newFeatureStop:
558 newFeatureList = newFList
561 newFeatureList.append((start, stop, featureType))
563 return newFeatureList, chrom, sense
566 def getLocusByChromDict(genome, upstream=0, downstream=0, useCDS=True,
567 additionalRegionsDict={}, ignorePseudo=False, upstreamSpanTSS=False,
568 lengthCDS=0, keepSense=False, adjustToNeighbor=True):
569 """ return a dictionary of gene loci. Can be used to retrieve additional
570 sequence upstream or downstream of gene, up to the next gene. Requires
571 cistematic, obviously.
572 Can filter-out pseudogenes and use additional regions outside of existing
573 gene models. Use upstreamSpanTSS to overlap half of the upstream region
575 If lengthCDS > 0 bp, e.g. X, return only the starting X bp from CDS. If
576 lengthCDS < 0bp, return only the last X bp from CDS.
578 locusByChromDict = {}
579 if usingBadLocusParameters(upstream, downstream, useCDS, lengthCDS, upstreamSpanTSS):
580 return locusByChromDict
582 genomeName = genome.genome
583 featuresDict = getGeneFeatures(genome, additionalRegionsDict)
584 for gid in featuresDict:
585 featureList = featuresDict[gid]
587 for (ftype, chrom, start, stop, sense) in featureList:
588 newFeatureList.append((start, stop))
590 if ignorePseudo and ftype == "PSEUDO":
593 newFeatureList.sort()
595 sense = featureList[0][-1]
596 gstart = newFeatureList[0][0]
597 gstop = newFeatureList[-1][1]
598 glen = abs(gstart - gstop)
600 if not useCDS and upstream > 0:
602 if gstop > (gstart + upstream / 2):
603 gstop = gstart + upstream / 2
606 elif not useCDS and downstream > 0:
611 distance = upstream / 2
616 nextGene = genome.leftGeneDistance((genomeName, gid), distance * 2)
617 if nextGene < distance * 2:
618 distance = nextGene / 2
620 distance = max(distance, 1)
624 distance = downstream
626 nextGene = genome.rightGeneDistance((genomeName, gid), downstream * 2)
627 if nextGene < downstream * 2:
628 distance = nextGene / 2
630 distance = max(distance, 1)
633 if 0 < lengthCDS < glen:
634 gstop = newFeatureList[0][0] + lengthCDS
636 if lengthCDS < 0 and abs(lengthCDS) < glen:
637 gstart = newFeatureList[-1][1] + lengthCDS
639 if not useCDS and upstream > 0:
641 if gstart < (gstop - upstream / 2):
642 gstart = gstop - upstream / 2
645 elif not useCDS and downstream > 0:
650 distance = upstream /2
655 nextGene = genome.rightGeneDistance((genomeName, gid), distance * 2)
656 if nextGene < distance * 2:
657 distance = nextGene / 2
659 distance = max(distance, 1)
663 distance = downstream
665 nextGene = genome.leftGeneDistance((genomeName, gid), downstream * 2)
666 if nextGene < downstream * 2:
667 distance = nextGene / 2
669 distance = max(distance, 1)
672 if 0 < lengthCDS < glen:
673 gstart = newFeatureList[-1][-1] - lengthCDS
675 if lengthCDS < 0 and abs(lengthCDS) < glen:
676 gstop = newFeatureList[0][0] - lengthCDS
678 if chrom not in locusByChromDict:
679 locusByChromDict[chrom] = []
682 locusByChromDict[chrom].append((gstart, gstop, gid, glen, sense))
684 locusByChromDict[chrom].append((gstart, gstop, gid, glen))
686 for chrom in locusByChromDict:
687 locusByChromDict[chrom].sort()
689 return locusByChromDict
692 def usingBadLocusParameters(upstream, downstream, useCDS, lengthCDS, upstreamSpanTSS):
693 if upstream == 0 and downstream == 0 and not useCDS:
694 print "getLocusByChromDict: asked for no sequence - returning empty dict"
696 elif upstream > 0 and downstream > 0 and not useCDS:
697 print "getLocusByChromDict: asked for only upstream and downstream - returning empty dict"
699 elif lengthCDS != 0 and not useCDS:
700 print "getLocusByChromDict: asked for partial CDS but not useCDS - returning empty dict"
702 elif upstreamSpanTSS and lengthCDS != 0:
703 print "getLocusByChromDict: asked for TSS spanning and partial CDS - returning empty dict"
705 elif lengthCDS > 0 and downstream > 0:
706 print "getLocusByChromDict: asked for discontinuous partial CDS from start and downstream - returning empty dict"
708 elif lengthCDS < 0 and upstream > 0:
709 print "getLocusByChromDict: asked for discontinuous partial CDS from stop and upstream - returning empty dict"
715 def getGeneFeatures(genome, additionalRegionsDict):
716 featuresDict = genome.getallGeneFeatures()
717 if len(additionalRegionsDict) > 0:
719 for chrom in additionalRegionsDict:
720 for region in additionalRegionsDict[chrom]:
722 if label not in sortList:
723 sortList.append(label)
725 if label not in featuresDict:
726 featuresDict[label] = []
729 sense = featuresDict[label][0][-1]
731 featuresDict[label].append(("custom", chrom, region.start, region.stop, sense))
734 featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
739 def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[],
740 normalizedTag=1., defaultRegionFormat=True, fixedFirstBin=-1,
742 """ returns 2 dictionaries of bin counts and region lengths, given a dictionary of predefined regions,
743 a dictionary of reads, a number of bins, the length of reads, and optionally a list of regions
744 or a different weight / tag.
750 if defaultRegionFormat:
763 print "entering computeRegionBins"
764 if len(regionList) > 0:
765 for readID in regionList:
766 regionsBins[readID] = [0.] * bins
768 for chrom in regionsByChromDict:
769 for regionTuple in regionsByChromDict[chrom]:
770 regionID = regionTuple[regionIDField]
771 regionsBins[regionID] = [0.] * bins
773 for chrom in hitDict:
774 if chrom not in regionsByChromDict:
777 for regionTuple in regionsByChromDict[chrom]:
778 regionID = regionTuple[regionIDField]
779 regionsLen[regionID] = regionTuple[lengthField]
783 for read in hitDict[chrom]:
784 tagStart = read["start"]
785 weight = read["weight"]
787 if index % 100000 == 0:
788 print "read %d " % index,
790 stopPoint = tagStart + readlen
794 for regionTuple in regionsByChromDict[chrom][startRegion:]:
795 start = regionTuple[startField]
796 stop = regionTuple[stopField]
797 regionID = regionTuple[regionIDField]
798 rlen = regionTuple[lengthField]
800 rsense = regionTuple[senseField]
808 if start > stopPoint:
812 if start <= tagStart <= stop:
814 regionBinLength = rlen / bins
816 regionBinLength = binLength
819 positionInRegion = tagStart - start
821 positionInRegion = start + rlen - tagStart
823 # we are relying on python's integer division quirk
824 binID = positionInRegion / regionBinLength
825 if (fixedFirstBin > 0) and (positionInRegion < fixedFirstBin):
827 elif fixedFirstBin > 0:
834 regionsBins[regionID][binID] += normalizedTag * weight
836 print "%s %s" % (regionID, str(binID))
840 return (regionsBins, regionsLen)