9 from time import strftime
10 from array import array
11 from collections import defaultdict
13 from cistematic.core.geneinfo import geneinfoDB
14 from cistematic.genomes import Genome
17 commoncodeVersion = 5.6
18 currentRDSversion = 2.0
21 def getReverseComplement(base):
32 def countDuplicatesInList(listToCheck):
33 tally = defaultdict(int)
34 for item in listToCheck:
40 def writeLog(logFile, messenger, message):
41 """ create a log file to write a message from a messenger or append to an existing file.
44 logfile = open(logFile)
46 logfile = open(logFile, "w")
48 logfile = open(logFile, "a")
50 logfile.writelines("%s: [%s] %s\n" % (strftime("%Y-%m-%d %H:%M:%S"), messenger, message))
54 def getGeneInfoDict(genome, cache=False):
55 idb = geneinfoDB(cache=cache)
56 if genome == "dmelanogaster":
57 geneinfoDict = idb.getallGeneInfo(genome, infoKey="locus")
59 geneinfoDict = idb.getallGeneInfo(genome)
64 def getGeneAnnotDict(genome, inRAM=False):
65 return getExtendedGeneAnnotDict(genome, "", inRAM=inRAM)
68 def getExtendedGeneAnnotDict(genome, extendGenome, replaceModels=False, inRAM=False):
69 hg = Genome(genome, inRAM=inRAM)
70 if extendGenome != "":
71 hg.extendFeatures(extendGenome, replace=replaceModels)
73 geneannotDict = hg.allAnnotInfo()
78 def getConfigParser(fileList=[]):
79 configFiles = ["erange.config", os.path.expanduser("~/.erange.config")]
80 for filename in fileList:
81 configFiles.append(filename)
83 config = ConfigParser.SafeConfigParser()
84 config.read(configFiles)
89 def getConfigOption(parser, section, option, default=None):
91 setting = parser.get(section, option)
92 except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
98 def getConfigIntOption(parser, section, option, default=None):
100 setting = parser.getint(section, option)
101 except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
107 def getConfigFloatOption(parser, section, option, default=None):
109 setting = parser.getfloat(section, option)
110 except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
116 def getConfigBoolOption(parser, section, option, default=None):
118 setting = parser.getboolean(section, option)
119 except (ConfigParser.NoSectionError, ConfigParser.NoOptionError, ValueError):
125 def getAllConfigSectionOptions(parser, section):
127 setting = parser.items(section)
128 except ConfigParser.NoSectionError:
134 def getMergedRegions(regionfilename, maxDist=1000, minHits=0, verbose=False, keepLabel=False,
135 fullChrom=False, chromField=1, scoreField=4, pad=0, compact=False,
136 doMerge=True, keepPeak=False, returnTop=0):
138 """ returns a dictionary containing a list of merged overlapping regions by chromosome;
139 can optionally filter regions that have a scoreField fewer than minHits.
140 Can also optionally return the label of each region, as well as the
141 peak, if supplied (peakPos and peakHeight should be the last 2 fields).
142 Can return the top regions based on score if higher than minHits.
144 infile = open(regionfilename)
145 lines = infile.readlines()
146 regions = getMergedRegionsFromList(lines, maxDist, minHits, verbose, keepLabel,
147 fullChrom, chromField, scoreField, pad, compact,
148 doMerge, keepPeak, returnTop)
155 def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False, keepLabel=False,
156 fullChrom = False, chromField=1, scoreField=4, pad=0, compact=False,
157 doMerge=True, keepPeak=False, returnTop=0):
158 """ returns a dictionary containing a list of merged overlapping regions by chromosome;
159 can optionally filter regions that have a scoreField fewer than minHits.
160 Can also optionally return the label of each region, as well as the
161 peak, if supplied (peakPos and peakHeight should be the last 2 fields).
162 Can return the top regions based on score if higher than minHits.
167 if 0 < returnTop < len(regionList):
169 for regionEntry in regionList:
170 if regionEntry[0] == "#":
171 if "pvalue" in regionEntry:
174 if "readShift" in regionEntry:
179 fields = regionEntry.strip().split("\t")
180 hits = float(fields[scoreField].strip())
184 returnTop = -1 * returnTop
185 minScore = scores[returnTop]
186 if minScore > minHits:
190 chromField = int(chromField)
192 #TODO: Current algorithm processes input file line by line and compares with prior lines. Problem is it
193 # exits at the first merge. This is not a problem when the input is sorted by start position, but in
194 # the case of 3 regions ABC that are in the input file as ACB as it goes now when processing C there
195 # will be no merge with A as B is needed to bridge the two. When it comes time to process B it will
196 # be merged with A but that will exit the loop and the merge with C will be missed.
197 for regionEntry in regionList:
198 if regionEntry[0] == "#":
199 if "pvalue" in regionEntry:
202 if "readShift" in regionEntry:
207 fields = regionEntry.strip().split("\t")
210 hits = float(fields[scoreField].strip())
211 except (IndexError, ValueError):
218 (chrom, pos) = fields[chromField].split(":")
219 (front, back) = pos.split("-")
223 label = string.join(fields[:chromField],"\t")
224 chrom = fields[chromField]
225 start = int(fields[chromField + 1]) - pad
226 stop = int(fields[chromField + 2]) + pad
230 start = int(fields[2]) - pad
231 stop = int(fields[3]) + pad
237 peakPos = int(fields[-2 - hasPvalue - hasShift])
238 peakHeight = float(fields[-1 - hasPvalue - hasShift])
240 if chrom not in regions:
245 if doMerge and len(regions[chrom]) > 0:
246 for index in range(len(regions[chrom])):
247 region = regions[chrom][index]
248 rstart = region.start
250 if regionsOverlap(start, stop, rstart, rstop) or regionsAreWithinDistance(start, stop, rstart, rstop, maxDist):
258 rpeakPos = region.peakPos
259 rpeakHeight = region.peakHeight
260 if peakHeight > rpeakHeight:
261 rpeakHeight = peakHeight
264 regions[chrom][index].start = rstart
265 regions[chrom][index].stop = rstop
266 regions[chrom][index].length = abs(rstop - rstart)
268 regions[chrom][index].label = label
271 regions[chrom][index].peakPos = rpeakPos
272 regions[chrom][index].peakHeight = rpeakHeight
280 region = Region.Region(start, stop)
285 region.peakPos = peakPos
286 region.peakHeight = peakHeight
288 regions[chrom].append(region)
291 if verbose and (count % 100000 == 0):
295 for chrom in regions:
296 regionCount += len(regions[chrom])
297 regions[chrom].sort(cmp=lambda x,y:cmp(x.start, y.start))
300 print "merged %d times" % mergeCount
301 print "returning %d regions" % regionCount
306 def regionsOverlap(start, stop, rstart, rstop):
308 (start, stop) = (stop, start)
311 (rstart, rstop) = (rstop, rstart)
313 return (rstart <= start <= rstop) or (rstart <= stop <= rstop) or (start <= rstart <= stop) or (start <= rstop <= stop)
316 def regionsAreWithinDistance(start, stop, rstart, rstop, maxDist):
318 (start, stop) = (stop, start)
321 (rstart, rstop) = (rstop, rstart)
323 return (abs(rstart-stop) <= maxDist) or (abs(rstop-start) <= maxDist)
326 def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False,
327 shift=0, maxshift=75):
328 """ find the peak in a list of reads (hitlist) in a region
329 of a given length and absolute start point. returns a
330 list of peaks, the number of hits, a triangular-smoothed
331 version of hitlist, and the number of reads that are
332 forward (plus) sense.
333 If doWeight is True, weight the reads accordingly.
334 If leftPlus is True, return the number of plus reads left of
335 the peak, taken to be the first TopPos position.
339 shift = getBestShiftForRegion(hitList, start, length, doWeight, maxshift)
341 seqArray, regionArray, numHits, numPlus = findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, leftPlus)
343 # implementing a triangular smooth
344 smoothArray = array("f", [0.] * length)
345 for pos in range(2,length -2):
346 smoothArray[pos] = (seqArray[pos -2] + 2 * seqArray[pos - 1] + 3 * seqArray[pos] + 2 * seqArray[pos + 1] + seqArray[pos + 2]) / 9.0
348 topPos = getPeakPositionList(smoothArray, length)
349 peak = Peak(topPos, numHits, smoothArray, numPlus, shift=shift)
354 for read in regionArray:
356 weight = read["weight"]
360 currentPos = read["start"] - start
361 if currentPos <= maxPos and read["sense"] == "+":
362 numLeftPlus += weight
364 peak.numLeftPlus = numLeftPlus
369 def getBestShiftForRegion(hitList, start, length, doWeight=False, maxShift=75):
371 lowestScore = 20000000000
372 for testShift in xrange(maxShift + 1):
373 shiftArray = array("f", [0.] * length)
375 currentpos = read["start"] - start
376 if read["sense"] == "+":
377 currentpos += testShift
379 currentpos -= testShift
381 if (currentpos < 1) or (currentpos >= length):
385 weight = read["weight"]
389 if read["sense"] == "+":
390 shiftArray[currentpos] += weight
392 shiftArray[currentpos] -= weight
395 for score in shiftArray:
396 currentScore += abs(score)
399 if currentScore < lowestScore:
400 bestShift = testShift
401 lowestScore = currentScore
406 def findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, leftPlus):
407 seqArray = array("f", [0.] * length)
412 currentpos = read["start"] - start
413 if read["sense"] == "+":
418 if (currentpos < 1 - readlen) or (currentpos >= length):
422 weight = read["weight"]
428 regionArray.append(read)
431 while currentpos < 0:
435 while hitIndex < readlen and currentpos < length:
436 seqArray[currentpos] += weight
440 if read["sense"] == "+":
443 return seqArray, regionArray, numHits, numPlus
446 def getPeakPositionList(smoothArray, length):
449 for currentpos in xrange(length):
450 if topNucleotide < smoothArray[currentpos]:
451 topNucleotide = smoothArray[currentpos]
452 peakList = [currentpos]
453 elif topNucleotide == smoothArray[currentpos]:
454 peakList.append(currentpos)
459 def getFeaturesByChromDict(genomeObject, additionalRegionsDict={}, ignorePseudo=False,
460 restrictList=[], regionComplement=False, maxStop=250000000):
461 """ return a dictionary of cistematic gene features. Requires
462 cistematic, obviously. Can filter-out pseudogenes. Will use
463 additional regions dict to supplement gene models, if available.
464 Can restrict output to a list of GIDs.
465 If regionComplement is set to true, returns the regions *outside* of the
466 calculated boundaries, which is useful for retrieving intronic and
467 intergenic regions. maxStop is simply used to define the uppermost
468 boundary of the complement region.
470 featuresDict = genomeObject.getallGeneFeatures()
472 if len(restrictList) > 0:
475 if len(additionalRegionsDict) > 0:
477 for chrom in additionalRegionsDict:
478 for region in additionalRegionsDict[chrom]:
480 if label not in sortList:
481 sortList.append(label)
483 if label not in featuresDict:
484 featuresDict[label] = []
487 sense = featuresDict[label][0][-1]
489 featuresDict[label].append(("custom", chrom, region.start, region.stop, sense))
492 featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
494 featuresByChromDict = {}
495 for gid in featuresDict:
496 if restrictGID and gid not in restrictList:
499 featureList = featuresDict[gid]
502 for (ftype, chrom, start, stop, sense) in featureList:
503 if ftype == "PSEUDO":
506 if (start, stop, ftype) not in newFeatureList:
509 for (fstart, fstop, ftype2) in newFeatureList:
510 if start >= fstart and stop <= fstop:
513 if start < fstart and stop > fstop:
514 containedList.append((fstart, fstop))
516 if len(containedList) > 0:
519 for (fstart, fstop, ftype2) in newFeatureList:
520 if (fstart, fstop) not in containedList:
521 newFList.append((fstart, fstop, ftype2))
522 if start >= fstart and stop <= fstop:
525 newFeatureList = newFList
527 newFeatureList.append((start, stop, ftype))
529 if ignorePseudo and isPseudo:
532 if chrom not in featuresByChromDict:
533 featuresByChromDict[chrom] = []
535 for (start, stop, ftype) in newFeatureList:
536 featuresByChromDict[chrom].append((start, stop, gid, sense, ftype))
538 for chrom in featuresByChromDict:
539 featuresByChromDict[chrom].sort()
542 complementByChromDict = {}
544 for chrom in featuresByChromDict:
545 complementByChromDict[chrom] = []
546 listLength = len(featuresByChromDict[chrom])
549 for index in range(listLength):
550 currentStop = featuresByChromDict[chrom][index][0]
552 if currentStart < currentStop:
553 complementByChromDict[chrom].append((currentStart, currentStop, "nonExon%d" % complementIndex, "F", "nonExon"))
555 currentStart = featuresByChromDict[chrom][index][1]
557 currentStop = maxStop
558 complementByChromDict[chrom].append((currentStart, currentStop, "nonExon%d" % complementIndex, "F", "nonExon"))
560 return (featuresByChromDict, complementByChromDict)
562 return featuresByChromDict
565 def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True,
566 additionalRegionsDict={}, ignorePseudo=False, upstreamSpanTSS=False,
567 lengthCDS=0, keepSense=False, adjustToNeighbor=True):
568 """ return a dictionary of gene loci. Can be used to retrieve additional
569 sequence upstream or downstream of gene, up to the next gene. Requires
570 cistematic, obviously.
571 Can filter-out pseudogenes and use additional regions outside of existing
572 gene models. Use upstreamSpanTSS to overlap half of the upstream region
574 If lengthCDS > 0 bp, e.g. X, return only the starting X bp from CDS. If
575 lengthCDS < 0bp, return only the last X bp from CDS.
577 locusByChromDict = {}
578 if upstream == 0 and downstream == 0 and not useCDS:
579 print "getLocusByChromDict: asked for no sequence - returning empty dict"
580 return locusByChromDict
581 elif upstream > 0 and downstream > 0 and not useCDS:
582 print "getLocusByChromDict: asked for only upstream and downstream - returning empty dict"
583 return locusByChromDict
584 elif lengthCDS != 0 and not useCDS:
585 print "getLocusByChromDict: asked for partial CDS but not useCDS - returning empty dict"
586 return locusByChromDict
587 elif upstreamSpanTSS and lengthCDS != 0:
588 print "getLocusByChromDict: asked for TSS spanning and partial CDS - returning empty dict"
589 return locusByChromDict
590 elif lengthCDS > 0 and downstream > 0:
591 print "getLocusByChromDict: asked for discontinuous partial CDS from start and downstream - returning empty dict"
592 return locusByChromDict
593 elif lengthCDS < 0 and upstream > 0:
594 print "getLocusByChromDict: asked for discontinuous partial CDS from stop and upstream - returning empty dict"
595 return locusByChromDict
597 genome = genomeObject.genome
598 featuresDict = genomeObject.getallGeneFeatures()
599 if len(additionalRegionsDict) > 0:
601 for chrom in additionalRegionsDict:
602 for region in additionalRegionsDict[chrom]:
604 if label not in sortList:
605 sortList.append(label)
607 if label not in featuresDict:
608 featuresDict[label] = []
611 sense = featuresDict[label][0][-1]
613 featuresDict[label].append(("custom", chrom, region.start, region.stop, sense))
616 featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
618 for gid in featuresDict:
619 featureList = featuresDict[gid]
621 for (ftype, chrom, start, stop, sense) in featureList:
622 newFeatureList.append((start, stop))
624 if ignorePseudo and ftype == "PSEUDO":
627 newFeatureList.sort()
629 sense = featureList[0][-1]
630 gstart = newFeatureList[0][0]
631 gstop = newFeatureList[-1][1]
632 glen = abs(gstart - gstop)
634 if not useCDS and upstream > 0:
636 if gstop > (gstart + upstream / 2):
637 gstop = gstart + upstream / 2
640 elif not useCDS and downstream > 0:
645 distance = upstream / 2
650 nextGene = genomeObject.leftGeneDistance((genome, gid), distance * 2)
651 if nextGene < distance * 2:
652 distance = nextGene / 2
660 distance = downstream
662 nextGene = genomeObject.rightGeneDistance((genome, gid), downstream * 2)
663 if nextGene < downstream * 2:
664 distance = nextGene / 2
673 gstop = newFeatureList[0][0] + lengthCDS
676 if abs(lengthCDS) < glen:
677 gstart = newFeatureList[-1][1] + lengthCDS
679 if not useCDS and upstream > 0:
681 if gstart < (gstop - upstream / 2):
682 gstart = gstop - upstream / 2
685 elif not useCDS and downstream > 0:
690 distance = upstream /2
695 nextGene = genomeObject.rightGeneDistance((genome, gid), distance * 2)
696 if nextGene < distance * 2:
697 distance = nextGene / 2
705 distance = downstream
707 nextGene = genomeObject.leftGeneDistance((genome, gid), downstream * 2)
708 if nextGene < downstream * 2:
709 distance = nextGene / 2
718 gstart = newFeatureList[-1][-1] - lengthCDS
721 if abs(lengthCDS) < glen:
722 gstop = newFeatureList[0][0] - lengthCDS
724 glen = abs(gstop - gstart)
725 if chrom not in locusByChromDict:
726 locusByChromDict[chrom] = []
729 locusByChromDict[chrom].append((gstart, gstop, gid, glen, sense))
731 locusByChromDict[chrom].append((gstart, gstop, gid, glen))
733 for chrom in locusByChromDict:
734 locusByChromDict[chrom].sort()
736 return locusByChromDict
739 def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[],
740 normalizedTag=1., defaultRegionFormat=True, fixedFirstBin=-1,
742 """ returns 2 dictionaries of bin counts and region lengths, given a dictionary of predefined regions,
743 a dictionary of reads, a number of bins, the length of reads, and optionally a list of regions
744 or a different weight / tag.
750 if defaultRegionFormat:
763 print "entering computeRegionBins"
764 if len(regionList) > 0:
765 for readID in regionList:
766 regionsBins[readID] = [0.] * bins
768 for chrom in regionsByChromDict:
769 for regionTuple in regionsByChromDict[chrom]:
770 regionID = regionTuple[regionIDField]
771 regionsBins[regionID] = [0.] * bins
773 for chrom in hitDict:
774 if chrom not in regionsByChromDict:
777 for regionTuple in regionsByChromDict[chrom]:
778 regionID = regionTuple[regionIDField]
779 regionsLen[regionID] = regionTuple[lengthField]
783 for read in hitDict[chrom]:
784 tagStart = read["start"]
785 weight = read["weight"]
787 if index % 100000 == 0:
788 print "read %d " % index,
790 stopPoint = tagStart + readlen
794 for regionTuple in regionsByChromDict[chrom][startRegion:]:
795 start = regionTuple[startField]
796 stop = regionTuple[stopField]
797 regionID = regionTuple[regionIDField]
798 rlen = regionTuple[lengthField]
800 rsense = regionTuple[senseField]
808 if start > stopPoint:
812 if start <= tagStart <= stop:
814 regionBinLength = rlen / bins
816 regionBinLength = binLength
818 startdist = tagStart - start
820 # we are relying on python's integer division quirk
821 binID = startdist / regionBinLength
822 if (fixedFirstBin > 0) and (startdist < fixedFirstBin):
824 elif fixedFirstBin > 0:
831 regionsBins[regionID][binID] += normalizedTag * weight
833 print "%s %s" % (regionID, str(binID))
835 rdist = rlen - startdist
836 binID = rdist / regionBinLength
837 if (fixedFirstBin > 0) and (rdist < fixedFirstBin):
839 elif fixedFirstBin > 0:
846 regionsBins[regionID][binID] += normalizedTag * weight
848 print "%s %s" % (regionID, str(binID))
852 return (regionsBins, regionsLen)