9 from time import strftime
10 from array import array
11 from collections import defaultdict
13 from cistematic.core.geneinfo import geneinfoDB
14 from cistematic.genomes import Genome
17 commoncodeVersion = 5.6
18 currentRDSversion = 2.0
20 class ErangeError(Exception):
24 def getReverseComplement(base):
35 def countDuplicatesInList(listToCheck):
36 tally = defaultdict(int)
37 for item in listToCheck:
43 def writeLog(logFile, messenger, message):
44 """ create a log file to write a message from a messenger or append to an existing file.
47 logfile = open(logFile)
49 logfile = open(logFile, "w")
51 logfile = open(logFile, "a")
53 logfile.writelines("%s: [%s] %s\n" % (strftime("%Y-%m-%d %H:%M:%S"), messenger, message))
57 def getGeneInfoDict(genome, cache=False):
58 idb = geneinfoDB(cache=cache)
59 if genome == "dmelanogaster":
60 geneinfoDict = idb.getallGeneInfo(genome, infoKey="locus")
62 geneinfoDict = idb.getallGeneInfo(genome)
67 def getGeneAnnotDict(genome, inRAM=False):
68 return getExtendedGeneAnnotDict(genome, "", inRAM=inRAM)
71 def getExtendedGeneAnnotDict(genomeName, extendGenome, replaceModels=False, inRAM=False):
72 genome = Genome(genomeName, inRAM=inRAM)
73 if extendGenome != "":
74 genome.extendFeatures(extendGenome, replace=replaceModels)
76 geneannotDict = genome.allAnnotInfo()
81 def getConfigParser(fileList=[]):
82 configFiles = ["erange.config", os.path.expanduser("~/.erange.config")]
83 for filename in fileList:
84 configFiles.append(filename)
86 config = ConfigParser.SafeConfigParser()
87 config.read(configFiles)
92 def getConfigOption(parser, section, option, default=None):
94 setting = parser.get(section, option)
95 except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
101 def getConfigIntOption(parser, section, option, default=None):
103 setting = parser.getint(section, option)
104 except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
110 def getConfigFloatOption(parser, section, option, default=None):
112 setting = parser.getfloat(section, option)
113 except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
119 def getConfigBoolOption(parser, section, option, default=None):
121 setting = parser.getboolean(section, option)
122 except (ConfigParser.NoSectionError, ConfigParser.NoOptionError, ValueError):
128 def getAllConfigSectionOptions(parser, section):
130 setting = parser.items(section)
131 except ConfigParser.NoSectionError:
137 def getMergedRegions(regionfilename, maxDist=1000, minHits=0, verbose=False, keepLabel=False,
138 fullChrom=False, chromField=1, scoreField=4, pad=0, compact=False,
139 doMerge=True, keepPeak=False, returnTop=0):
141 """ returns a dictionary containing a list of merged overlapping regions by chromosome;
142 can optionally filter regions that have a scoreField fewer than minHits.
143 Can also optionally return the label of each region, as well as the
144 peak, if supplied (peakPos and peakHeight should be the last 2 fields).
145 Can return the top regions based on score if higher than minHits.
147 infile = open(regionfilename)
148 lines = infile.readlines()
149 regions = getMergedRegionsFromList(lines, maxDist, minHits, verbose, keepLabel,
150 fullChrom, chromField, scoreField, pad, compact,
151 doMerge, keepPeak, returnTop)
158 def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False, keepLabel=False,
159 fullChrom = False, chromField=1, scoreField=4, pad=0, compact=False,
160 doMerge=True, keepPeak=False, returnTop=0):
161 """ returns a dictionary containing a list of merged overlapping regions by chromosome;
162 can optionally filter regions that have a scoreField fewer than minHits.
163 Can also optionally return the label of each region, as well as the
164 peak, if supplied (peakPos and peakHeight should be the last 2 fields).
165 Can return the top regions based on score if higher than minHits.
170 if 0 < returnTop < len(regionList):
172 for regionEntry in regionList:
173 if regionEntry[0] == "#":
174 if "pvalue" in regionEntry:
177 if "readShift" in regionEntry:
182 fields = regionEntry.strip().split("\t")
183 hits = float(fields[scoreField].strip())
187 returnTop = -1 * returnTop
188 minScore = scores[returnTop]
189 if minScore > minHits:
193 chromField = int(chromField)
195 for regionEntry in regionList:
196 if regionEntry[0] == "#":
197 if "pvalue" in regionEntry:
200 if "readShift" in regionEntry:
205 fields = regionEntry.strip().split("\t")
208 hits = float(fields[scoreField].strip())
209 except (IndexError, ValueError):
216 (chrom, pos) = fields[chromField].split(":")
217 (front, back) = pos.split("-")
221 label = string.join(fields[:chromField],"\t")
222 chrom = fields[chromField]
223 start = int(fields[chromField + 1]) - pad
224 stop = int(fields[chromField + 2]) + pad
228 start = int(fields[2]) - pad
229 stop = int(fields[3]) + pad
235 peakPos = int(fields[-2 - hasPvalue - hasShift])
236 peakHeight = float(fields[-1 - hasPvalue - hasShift])
238 if chrom not in regions:
243 if doMerge and len(regions[chrom]) > 0:
244 for index in range(len(regions[chrom])):
245 region = regions[chrom][index]
246 rstart = region.start
248 if regionsOverlap(start, stop, rstart, rstop) or regionsAreWithinDistance(start, stop, rstart, rstop, maxDist):
256 rpeakPos = region.peakPos
257 rpeakHeight = region.peakHeight
258 if peakHeight > rpeakHeight:
259 rpeakHeight = peakHeight
262 regions[chrom][index].start = rstart
263 regions[chrom][index].stop = rstop
264 regions[chrom][index].length = abs(rstop - rstart)
266 regions[chrom][index].label = label
269 regions[chrom][index].peakPos = rpeakPos
270 regions[chrom][index].peakHeight = rpeakHeight
278 region = Region.Region(start, stop)
283 region.peakPos = peakPos
284 region.peakHeight = peakHeight
286 regions[chrom].append(region)
289 if verbose and (count % 100000 == 0):
293 for chrom in regions:
294 regionCount += len(regions[chrom])
295 regions[chrom].sort(cmp=lambda x,y:cmp(x.start, y.start))
298 print "merged %d times" % mergeCount
299 print "returning %d regions" % regionCount
304 def regionsOverlap(start, stop, rstart, rstop):
306 (start, stop) = (stop, start)
309 (rstart, rstop) = (rstop, rstart)
311 return (rstart <= start <= rstop) or (rstart <= stop <= rstop) or (start <= rstart <= stop) or (start <= rstop <= stop)
314 def regionsAreWithinDistance(start, stop, rstart, rstop, maxDist):
316 (start, stop) = (stop, start)
319 (rstart, rstop) = (rstop, rstart)
321 return (abs(rstart-stop) <= maxDist) or (abs(rstop-start) <= maxDist)
324 def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False,
325 shift=0, maxshift=75):
326 """ find the peak in a list of reads (hitlist) in a region
327 of a given length and absolute start point. returns a
328 list of peaks, the number of hits, a triangular-smoothed
329 version of hitlist, and the number of reads that are
330 forward (plus) sense.
331 If doWeight is True, weight the reads accordingly.
332 If leftPlus is True, return the number of plus reads left of
333 the peak, taken to be the first TopPos position.
337 shift = getBestShiftForRegion(hitList, start, length, useWeight=doWeight, maxShift=maxshift)
339 seqArray, regionArray, numHits, numPlus = findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, leftPlus)
341 # implementing a triangular smooth
342 smoothArray = array("f", [0.] * length)
343 for pos in range(2,length -2):
344 smoothArray[pos] = (seqArray[pos -2] + 2 * seqArray[pos - 1] + 3 * seqArray[pos] + 2 * seqArray[pos + 1] + seqArray[pos + 2]) / 9.0
346 topPos = getPeakPositionList(smoothArray, length)
347 peak = Peak.Peak(topPos, numHits, smoothArray, numPlus, shift=shift)
352 for read in regionArray:
354 weight = read["weight"]
358 currentPos = read["start"] - start
359 if currentPos <= maxPos and read["sense"] == "+":
360 numLeftPlus += weight
362 peak.numLeftPlus = numLeftPlus
367 def getBestShiftForRegion(readList, start, length, useWeight=False, maxShift=75):
369 lowestScore = 20000000000
370 for testShift in xrange(maxShift + 1):
371 shiftArray = array("f", [0.] * length)
372 for read in readList:
373 currentpos = read["start"] - start
374 if read["sense"] == "+":
375 currentpos += testShift
377 currentpos -= testShift
379 if (currentpos < 1) or (currentpos >= length):
383 weight = read["weight"]
387 if read["sense"] == "+":
388 shiftArray[currentpos] += weight
390 shiftArray[currentpos] -= weight
393 for score in shiftArray:
394 currentScore += abs(score)
397 if currentScore < lowestScore:
398 bestShift = testShift
399 lowestScore = currentScore
404 def findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, leftPlus):
405 seqArray = array("f", [0.] * length)
410 currentpos = read["start"] - start
411 if read["sense"] == "+":
416 if (currentpos < 1 - readlen) or (currentpos >= length):
420 weight = read["weight"]
426 regionArray.append(read)
429 while currentpos < 0:
433 while hitIndex < readlen and currentpos < length:
434 seqArray[currentpos] += weight
438 if read["sense"] == "+":
441 return seqArray, regionArray, numHits, numPlus
444 def getPeakPositionList(smoothArray, length):
447 for currentpos in xrange(length):
448 if topNucleotide < smoothArray[currentpos]:
449 topNucleotide = smoothArray[currentpos]
450 peakList = [currentpos]
451 elif topNucleotide == smoothArray[currentpos]:
452 peakList.append(currentpos)
457 def getFeaturesByChromDict(genomeObject, additionalRegionsDict={}, ignorePseudo=False,
458 restrictList=[], regionComplement=False, maxStop=250000000):
459 """ return a dictionary of cistematic gene features. Requires
460 cistematic, obviously. Can filter-out pseudogenes. Will use
461 additional regions dict to supplement gene models, if available.
462 Can restrict output to a list of GIDs.
463 If regionComplement is set to true, returns the regions *outside* of the
464 calculated boundaries, which is useful for retrieving intronic and
465 intergenic regions. maxStop is simply used to define the uppermost
466 boundary of the complement region.
468 featuresDict = genomeObject.getallGeneFeatures()
470 if len(restrictList) > 0:
473 if len(additionalRegionsDict) > 0:
475 for chrom in additionalRegionsDict:
476 for region in additionalRegionsDict[chrom]:
478 if label not in sortList:
479 sortList.append(label)
481 if label not in featuresDict:
482 featuresDict[label] = []
485 sense = featuresDict[label][0][-1]
487 featuresDict[label].append(("custom", chrom, region.start, region.stop, sense))
490 featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
492 featuresByChromDict = {}
493 for gid in featuresDict:
494 if restrictGID and gid not in restrictList:
497 featureList = featuresDict[gid]
500 for (ftype, chrom, start, stop, sense) in featureList:
501 if ftype == "PSEUDO":
504 if (start, stop, ftype) not in newFeatureList:
507 for (fstart, fstop, ftype2) in newFeatureList:
508 if start >= fstart and stop <= fstop:
511 if start < fstart and stop > fstop:
512 containedList.append((fstart, fstop))
514 if len(containedList) > 0:
517 for (fstart, fstop, ftype2) in newFeatureList:
518 if (fstart, fstop) not in containedList:
519 newFList.append((fstart, fstop, ftype2))
520 if start >= fstart and stop <= fstop:
523 newFeatureList = newFList
525 newFeatureList.append((start, stop, ftype))
527 if ignorePseudo and isPseudo:
530 if chrom not in featuresByChromDict:
531 featuresByChromDict[chrom] = []
533 for (start, stop, ftype) in newFeatureList:
534 featuresByChromDict[chrom].append((start, stop, gid, sense, ftype))
536 for chrom in featuresByChromDict:
537 featuresByChromDict[chrom].sort()
540 complementByChromDict = {}
542 for chrom in featuresByChromDict:
543 complementByChromDict[chrom] = []
544 listLength = len(featuresByChromDict[chrom])
547 for index in range(listLength):
548 currentStop = featuresByChromDict[chrom][index][0]
550 if currentStart < currentStop:
551 complementByChromDict[chrom].append((currentStart, currentStop, "nonExon%d" % complementIndex, "F", "nonExon"))
553 currentStart = featuresByChromDict[chrom][index][1]
555 currentStop = maxStop
556 complementByChromDict[chrom].append((currentStart, currentStop, "nonExon%d" % complementIndex, "F", "nonExon"))
558 return (featuresByChromDict, complementByChromDict)
560 return featuresByChromDict
563 def getLocusByChromDict(genome, upstream=0, downstream=0, useCDS=True,
564 additionalRegionsDict={}, ignorePseudo=False, upstreamSpanTSS=False,
565 lengthCDS=0, keepSense=False, adjustToNeighbor=True):
566 """ return a dictionary of gene loci. Can be used to retrieve additional
567 sequence upstream or downstream of gene, up to the next gene. Requires
568 cistematic, obviously.
569 Can filter-out pseudogenes and use additional regions outside of existing
570 gene models. Use upstreamSpanTSS to overlap half of the upstream region
572 If lengthCDS > 0 bp, e.g. X, return only the starting X bp from CDS. If
573 lengthCDS < 0bp, return only the last X bp from CDS.
575 locusByChromDict = {}
576 if upstream == 0 and downstream == 0 and not useCDS:
577 print "getLocusByChromDict: asked for no sequence - returning empty dict"
578 return locusByChromDict
579 elif upstream > 0 and downstream > 0 and not useCDS:
580 print "getLocusByChromDict: asked for only upstream and downstream - returning empty dict"
581 return locusByChromDict
582 elif lengthCDS != 0 and not useCDS:
583 print "getLocusByChromDict: asked for partial CDS but not useCDS - returning empty dict"
584 return locusByChromDict
585 elif upstreamSpanTSS and lengthCDS != 0:
586 print "getLocusByChromDict: asked for TSS spanning and partial CDS - returning empty dict"
587 return locusByChromDict
588 elif lengthCDS > 0 and downstream > 0:
589 print "getLocusByChromDict: asked for discontinuous partial CDS from start and downstream - returning empty dict"
590 return locusByChromDict
591 elif lengthCDS < 0 and upstream > 0:
592 print "getLocusByChromDict: asked for discontinuous partial CDS from stop and upstream - returning empty dict"
593 return locusByChromDict
595 genomeName = genome.genome
596 featuresDict = getGeneFeatures(genome, additionalRegionsDict)
597 for gid in featuresDict:
598 featureList = featuresDict[gid]
600 for (ftype, chrom, start, stop, sense) in featureList:
601 newFeatureList.append((start, stop))
603 if ignorePseudo and ftype == "PSEUDO":
606 newFeatureList.sort()
608 sense = featureList[0][-1]
609 gstart = newFeatureList[0][0]
610 gstop = newFeatureList[-1][1]
611 glen = abs(gstart - gstop)
613 if not useCDS and upstream > 0:
615 if gstop > (gstart + upstream / 2):
616 gstop = gstart + upstream / 2
619 elif not useCDS and downstream > 0:
624 distance = upstream / 2
629 nextGene = genome.leftGeneDistance((genomeName, gid), distance * 2)
630 if nextGene < distance * 2:
631 distance = nextGene / 2
633 distance = max(distance, 1)
637 distance = downstream
639 nextGene = genome.rightGeneDistance((genomeName, gid), downstream * 2)
640 if nextGene < downstream * 2:
641 distance = nextGene / 2
643 distance = max(distance, 1)
646 if 0 < lengthCDS < glen:
647 gstop = newFeatureList[0][0] + lengthCDS
649 if lengthCDS < 0 and abs(lengthCDS) < glen:
650 gstart = newFeatureList[-1][1] + lengthCDS
652 if not useCDS and upstream > 0:
654 if gstart < (gstop - upstream / 2):
655 gstart = gstop - upstream / 2
658 elif not useCDS and downstream > 0:
663 distance = upstream /2
668 nextGene = genome.rightGeneDistance((genomeName, gid), distance * 2)
669 if nextGene < distance * 2:
670 distance = nextGene / 2
672 distance = max(distance, 1)
676 distance = downstream
678 nextGene = genome.leftGeneDistance((genomeName, gid), downstream * 2)
679 if nextGene < downstream * 2:
680 distance = nextGene / 2
682 distance = max(distance, 1)
685 if 0 < lengthCDS < glen:
686 gstart = newFeatureList[-1][-1] - lengthCDS
688 if lengthCDS < 0 and abs(lengthCDS) < glen:
689 gstop = newFeatureList[0][0] - lengthCDS
691 if chrom not in locusByChromDict:
692 locusByChromDict[chrom] = []
695 locusByChromDict[chrom].append((gstart, gstop, gid, glen, sense))
697 locusByChromDict[chrom].append((gstart, gstop, gid, glen))
699 for chrom in locusByChromDict:
700 locusByChromDict[chrom].sort()
702 return locusByChromDict
705 def getGeneFeatures(genome, additionalRegionsDict):
706 featuresDict = genome.getallGeneFeatures()
707 if len(additionalRegionsDict) > 0:
709 for chrom in additionalRegionsDict:
710 for region in additionalRegionsDict[chrom]:
712 if label not in sortList:
713 sortList.append(label)
715 if label not in featuresDict:
716 featuresDict[label] = []
719 sense = featuresDict[label][0][-1]
721 featuresDict[label].append(("custom", chrom, region.start, region.stop, sense))
724 featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
729 def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[],
730 normalizedTag=1., defaultRegionFormat=True, fixedFirstBin=-1,
732 """ returns 2 dictionaries of bin counts and region lengths, given a dictionary of predefined regions,
733 a dictionary of reads, a number of bins, the length of reads, and optionally a list of regions
734 or a different weight / tag.
740 if defaultRegionFormat:
753 print "entering computeRegionBins"
754 if len(regionList) > 0:
755 for readID in regionList:
756 regionsBins[readID] = [0.] * bins
758 for chrom in regionsByChromDict:
759 for regionTuple in regionsByChromDict[chrom]:
760 regionID = regionTuple[regionIDField]
761 regionsBins[regionID] = [0.] * bins
763 for chrom in hitDict:
764 if chrom not in regionsByChromDict:
767 for regionTuple in regionsByChromDict[chrom]:
768 regionID = regionTuple[regionIDField]
769 regionsLen[regionID] = regionTuple[lengthField]
773 for read in hitDict[chrom]:
774 tagStart = read["start"]
775 weight = read["weight"]
777 if index % 100000 == 0:
778 print "read %d " % index,
780 stopPoint = tagStart + readlen
784 for regionTuple in regionsByChromDict[chrom][startRegion:]:
785 start = regionTuple[startField]
786 stop = regionTuple[stopField]
787 regionID = regionTuple[regionIDField]
788 rlen = regionTuple[lengthField]
790 rsense = regionTuple[senseField]
798 if start > stopPoint:
802 if start <= tagStart <= stop:
804 regionBinLength = rlen / bins
806 regionBinLength = binLength
808 startdist = tagStart - start
810 # we are relying on python's integer division quirk
811 binID = startdist / regionBinLength
812 if (fixedFirstBin > 0) and (startdist < fixedFirstBin):
814 elif fixedFirstBin > 0:
821 regionsBins[regionID][binID] += normalizedTag * weight
823 print "%s %s" % (regionID, str(binID))
825 rdist = rlen - startdist
826 binID = rdist / regionBinLength
827 if (fixedFirstBin > 0) and (rdist < fixedFirstBin):
829 elif fixedFirstBin > 0:
836 regionsBins[regionID][binID] += normalizedTag * weight
838 print "%s %s" % (regionID, str(binID))
842 return (regionsBins, regionsLen)