4 from time import strftime
5 from array import array
6 from collections import defaultdict
8 from cistematic.core.geneinfo import geneinfoDB
9 from cistematic.genomes import Genome
12 commoncodeVersion = 5.6
13 currentRDSversion = 2.0
15 class ErangeError(Exception):
19 def getReverseComplement(base):
30 def countDuplicatesInList(listToCheck):
31 tally = defaultdict(int)
32 for item in listToCheck:
38 def writeLog(logFile, messenger, message):
39 """ create a log file to write a message from a messenger or append to an existing file.
42 logfile = open(logFile)
44 logfile = open(logFile, "w")
46 logfile = open(logFile, "a")
48 logfile.writelines("%s: [%s] %s\n" % (strftime("%Y-%m-%d %H:%M:%S"), messenger, message))
52 def getGeneInfoDict(genome, cache=False):
53 idb = geneinfoDB(cache=cache)
54 if genome == "dmelanogaster":
55 geneinfoDict = idb.getallGeneInfo(genome, infoKey="locus")
57 geneinfoDict = idb.getallGeneInfo(genome)
62 def getGeneAnnotDict(genome, inRAM=False):
63 return getExtendedGeneAnnotDict(genome, "", inRAM=inRAM)
66 def getExtendedGeneAnnotDict(genomeName, extendGenome, replaceModels=False, inRAM=False):
67 genome = Genome(genomeName, inRAM=inRAM)
68 if extendGenome != "":
69 genome.extendFeatures(extendGenome, replace=replaceModels)
71 geneannotDict = genome.allAnnotInfo()
76 def getConfigParser(fileList=[]):
77 configFiles = ["erange.config", os.path.expanduser("~/.erange.config")]
78 for filename in fileList:
79 configFiles.append(filename)
81 config = ConfigParser.SafeConfigParser()
82 config.read(configFiles)
87 def getConfigOption(parser, section, option, default=None):
89 setting = parser.get(section, option)
90 except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
96 def getConfigIntOption(parser, section, option, default=None):
98 setting = parser.getint(section, option)
99 except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
105 def getConfigFloatOption(parser, section, option, default=None):
107 setting = parser.getfloat(section, option)
108 except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
114 def getConfigBoolOption(parser, section, option, default=None):
116 setting = parser.getboolean(section, option)
117 except (ConfigParser.NoSectionError, ConfigParser.NoOptionError, ValueError):
123 def getAllConfigSectionOptions(parser, section):
125 setting = parser.items(section)
126 except ConfigParser.NoSectionError:
132 def getMergedRegions(regionfilename, maxDist=1000, minHits=0, verbose=False, keepLabel=False,
133 fullChrom=False, chromField=1, scoreField=4, pad=0, compact=False,
134 doMerge=True, keepPeak=False, returnTop=0):
136 """ returns a dictionary containing a list of merged overlapping regions by chromosome;
137 can optionally filter regions that have a scoreField fewer than minHits.
138 Can also optionally return the label of each region, as well as the
139 peak, if supplied (peakPos and peakHeight should be the last 2 fields).
140 Can return the top regions based on score if higher than minHits.
142 infile = open(regionfilename)
143 lines = infile.readlines()
144 regions = getMergedRegionsFromList(lines, maxDist, minHits, verbose, keepLabel,
145 fullChrom, chromField, scoreField, pad, compact,
146 doMerge, keepPeak, returnTop)
153 def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False, keepLabel=False,
154 fullChrom = False, chromField=1, scoreField=4, pad=0, compact=False,
155 doMerge=True, keepPeak=False, returnTop=0):
156 """ returns a dictionary containing a list of merged overlapping regions by chromosome;
157 can optionally filter regions that have a scoreField fewer than minHits.
158 Can also optionally return the label of each region, as well as the
159 peak, if supplied (peakPos and peakHeight should be the last 2 fields).
160 Can return the top regions based on score if higher than minHits.
165 if 0 < returnTop < len(regionList):
167 for regionEntry in regionList:
168 if regionEntry[0] == "#":
169 if "pvalue" in regionEntry:
172 if "readShift" in regionEntry:
177 fields = regionEntry.strip().split("\t")
178 hits = float(fields[scoreField].strip())
182 returnTop = -1 * returnTop
183 minScore = scores[returnTop]
184 if minScore > minHits:
188 chromField = int(chromField)
190 #TODO: Current algorithm processes input file line by line and compares with prior lines. Problem is it
191 # exits at the first merge. This is not a problem when the input is sorted by start position, but in
192 # the case of 3 regions ABC that are in the input file as ACB as it goes now when processing C there
193 # will be no merge with A as B is needed to bridge the two. When it comes time to process B it will
194 # be merged with A but that will exit the loop and the merge with C will be missed.
195 for regionEntry in regionList:
196 if regionEntry[0] == "#":
197 if "pvalue" in regionEntry:
200 if "readShift" in regionEntry:
205 fields = regionEntry.strip().split("\t")
208 hits = float(fields[scoreField].strip())
209 except (IndexError, ValueError):
216 (chrom, pos) = fields[chromField].split(":")
217 (front, back) = pos.split("-")
221 label = string.join(fields[:chromField],"\t")
222 chrom = fields[chromField]
223 start = int(fields[chromField + 1]) - pad
224 stop = int(fields[chromField + 2]) + pad
228 start = int(fields[2]) - pad
229 stop = int(fields[3]) + pad
235 peakPos = int(fields[-2 - hasPvalue - hasShift])
236 peakHeight = float(fields[-1 - hasPvalue - hasShift])
238 if chrom not in regions:
243 if doMerge and len(regions[chrom]) > 0:
244 for index in range(len(regions[chrom])):
245 region = regions[chrom][index]
246 rstart = region.start
248 if regionsOverlap(start, stop, rstart, rstop) or regionsAreWithinDistance(start, stop, rstart, rstop, maxDist):
256 rpeakPos = region.peakPos
257 rpeakHeight = region.peakHeight
258 if peakHeight > rpeakHeight:
259 rpeakHeight = peakHeight
262 regions[chrom][index].start = rstart
263 regions[chrom][index].stop = rstop
264 regions[chrom][index].length = abs(rstop - rstart)
266 regions[chrom][index].label = label
269 regions[chrom][index].peakPos = rpeakPos
270 regions[chrom][index].peakHeight = rpeakHeight
278 region = Region.Region(start, stop)
283 region.peakPos = peakPos
284 region.peakHeight = peakHeight
286 regions[chrom].append(region)
289 if verbose and (count % 100000 == 0):
293 for chrom in regions:
294 regionCount += len(regions[chrom])
295 regions[chrom].sort(cmp=lambda x,y:cmp(x.start, y.start))
298 print "merged %d times" % mergeCount
299 print "returning %d regions" % regionCount
304 def regionsOverlap(start, stop, rstart, rstop):
306 (start, stop) = (stop, start)
309 (rstart, rstop) = (rstop, rstart)
311 return (rstart <= start <= rstop) or (rstart <= stop <= rstop) or (start <= rstart <= stop) or (start <= rstop <= stop)
314 def regionsAreWithinDistance(start, stop, rstart, rstop, maxDist):
316 (start, stop) = (stop, start)
319 (rstart, rstop) = (rstop, rstart)
321 return (abs(rstart-stop) <= maxDist) or (abs(rstop-start) <= maxDist)
324 def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False,
325 shift=0, maxshift=75):
326 """ find the peak in a list of reads (hitlist) in a region
327 of a given length and absolute start point. returns a
328 list of peaks, the number of hits, a triangular-smoothed
329 version of hitlist, and the number of reads that are
330 forward (plus) sense.
331 If doWeight is True, weight the reads accordingly.
332 If leftPlus is True, return the number of plus reads left of
333 the peak, taken to be the first TopPos position.
337 shift = getBestShiftForRegion(hitList, start, length, useWeight=doWeight, maxShift=maxshift)
339 seqArray, regionArray, numHits, numPlus = findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, leftPlus)
341 # implementing a triangular smooth
342 smoothArray = array("f", [0.] * length)
343 for pos in range(2,length -2):
344 smoothArray[pos] = (seqArray[pos -2] + 2 * seqArray[pos - 1] + 3 * seqArray[pos] + 2 * seqArray[pos + 1] + seqArray[pos + 2]) / 9.0
346 topPos = getPeakPositionList(smoothArray, length)
347 peak = Peak.Peak(topPos, numHits, smoothArray, numPlus, shift=shift)
352 for read in regionArray:
354 weight = read["weight"]
358 currentPos = read["start"] - start
359 if currentPos <= maxPos and read["sense"] == "+":
360 numLeftPlus += weight
362 peak.numLeftPlus = numLeftPlus
367 def getBestShiftForRegion(readList, start, length, useWeight=False, maxShift=75):
369 lowestScore = 20000000000
370 for testShift in xrange(maxShift + 1):
371 shiftArray = array("f", [0.] * length)
372 for read in readList:
373 currentpos = read["start"] - start
374 if read["sense"] == "+":
379 currentpos += testShift * senseModifier
380 if (currentpos < 1) or (currentpos >= length):
384 weight = read["weight"]
388 shiftArray[currentpos] += weight * senseModifier
391 for score in shiftArray:
392 currentScore += abs(score)
394 if currentScore < lowestScore:
395 bestShift = testShift
396 lowestScore = currentScore
401 def findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, leftPlus):
402 seqArray = array("f", [0.] * length)
407 currentpos = read["start"] - start
408 if read["sense"] == "+":
413 if (currentpos < 1 - readlen) or (currentpos >= length):
417 weight = read["weight"]
423 regionArray.append(read)
426 while currentpos < 0:
430 while hitIndex < readlen and currentpos < length:
431 seqArray[currentpos] += weight
435 if read["sense"] == "+":
438 return seqArray, regionArray, numHits, numPlus
441 def getPeakPositionList(smoothArray, length):
444 for currentpos in xrange(length):
445 if topNucleotide < smoothArray[currentpos]:
446 topNucleotide = smoothArray[currentpos]
447 peakList = [currentpos]
448 elif topNucleotide == smoothArray[currentpos]:
449 peakList.append(currentpos)
454 def getFeaturesByChromDict(genomeObject, additionalRegionsDict={}, ignorePseudo=False,
455 restrictList=[], regionComplement=False, maxStop=250000000):
456 """ return a dictionary of cistematic gene features. Requires
457 cistematic, obviously. Can filter-out pseudogenes. Will use
458 additional regions dict to supplement gene models, if available.
459 Can restrict output to a list of GIDs.
460 If regionComplement is set to true, returns the regions *outside* of the
461 calculated boundaries, which is useful for retrieving intronic and
462 intergenic regions. maxStop is simply used to define the uppermost
463 boundary of the complement region.
465 featuresDict = genomeObject.getallGeneFeatures()
467 if len(restrictList) > 0:
470 if len(additionalRegionsDict) > 0:
472 for chrom in additionalRegionsDict:
473 for region in additionalRegionsDict[chrom]:
475 if label not in sortList:
476 sortList.append(label)
478 if label not in featuresDict:
479 featuresDict[label] = []
482 sense = featuresDict[label][0][-1]
484 featuresDict[label].append(("custom", chrom, region.start, region.stop, sense))
487 featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
489 featuresByChromDict = {}
490 for gid in featuresDict:
491 if restrictGID and gid not in restrictList:
494 newFeatureList, chrom, sense = getNewFeatureList(featuresDict[gid])
495 if newFeatureList and chrom not in featuresByChromDict:
496 featuresByChromDict[chrom] = []
498 for (start, stop, ftype) in newFeatureList:
499 featuresByChromDict[chrom].append((start, stop, gid, sense, ftype))
501 for chrom in featuresByChromDict:
502 featuresByChromDict[chrom].sort()
505 complementByChromDict = {}
507 for chrom in featuresByChromDict:
508 complementByChromDict[chrom] = []
509 listLength = len(featuresByChromDict[chrom])
512 for index in range(listLength):
513 currentStop = featuresByChromDict[chrom][index][0]
515 if currentStart < currentStop:
516 complementByChromDict[chrom].append((currentStart, currentStop, "nonExon%d" % complementIndex, "F", "nonExon"))
518 currentStart = featuresByChromDict[chrom][index][1]
520 currentStop = maxStop
521 complementByChromDict[chrom].append((currentStart, currentStop, "nonExon%d" % complementIndex, "F", "nonExon"))
523 return (featuresByChromDict, complementByChromDict)
525 return featuresByChromDict
528 def getNewFeatureList(featureList, ignorePseudo=False):
530 for (featureType, chrom, start, stop, sense) in featureList:
531 if featureType == "PSEUDO" and ignorePseudo:
532 return [], chrom, sense
534 if (start, stop, featureType) not in newFeatureList:
537 for (newFeatureStart, newFeatureStop, featureType2) in newFeatureList:
538 if start >= newFeatureStart and stop <= newFeatureStop:
541 if start < newFeatureStart and stop > newFeatureStop:
542 containedList.append((newFeatureStart, newFeatureStop))
544 if len(containedList) > 0:
547 for (newFeatureStart, newFeatureStop, featureType2) in newFeatureList:
548 if (newFeatureStart, newFeatureStop) not in containedList:
549 newFList.append((newFeatureStart, newFeatureStop, featureType2))
550 if start >= newFeatureStart and stop <= newFeatureStop:
553 newFeatureList = newFList
556 newFeatureList.append((start, stop, featureType))
558 return newFeatureList, chrom, sense
561 def getLocusByChromDict(genome, upstream=0, downstream=0, useCDS=True,
562 additionalRegionsDict={}, ignorePseudo=False, upstreamSpanTSS=False,
563 lengthCDS=0, keepSense=False, adjustToNeighbor=True):
564 """ return a dictionary of gene loci. Can be used to retrieve additional
565 sequence upstream or downstream of gene, up to the next gene. Requires
566 cistematic, obviously.
567 Can filter-out pseudogenes and use additional regions outside of existing
568 gene models. Use upstreamSpanTSS to overlap half of the upstream region
570 If lengthCDS > 0 bp, e.g. X, return only the starting X bp from CDS. If
571 lengthCDS < 0bp, return only the last X bp from CDS.
573 locusByChromDict = {}
574 if usingBadLocusParameters(upstream, downstream, useCDS, lengthCDS, upstreamSpanTSS):
575 return locusByChromDict
577 genomeName = genome.genome
578 featuresDict = getGeneFeatures(genome, additionalRegionsDict)
579 for gid in featuresDict:
580 featureList = featuresDict[gid]
582 for (ftype, chrom, start, stop, sense) in featureList:
583 newFeatureList.append((start, stop))
585 if ignorePseudo and ftype == "PSEUDO":
588 newFeatureList.sort()
590 sense = featureList[0][-1]
591 gstart = newFeatureList[0][0]
592 gstop = newFeatureList[-1][1]
593 glen = abs(gstart - gstop)
595 if not useCDS and upstream > 0:
597 if gstop > (gstart + upstream / 2):
598 gstop = gstart + upstream / 2
601 elif not useCDS and downstream > 0:
606 distance = upstream / 2
611 nextGene = genome.leftGeneDistance((genomeName, gid), distance * 2)
612 if nextGene < distance * 2:
613 distance = nextGene / 2
615 distance = max(distance, 1)
619 distance = downstream
621 nextGene = genome.rightGeneDistance((genomeName, gid), downstream * 2)
622 if nextGene < downstream * 2:
623 distance = nextGene / 2
625 distance = max(distance, 1)
628 if 0 < lengthCDS < glen:
629 gstop = newFeatureList[0][0] + lengthCDS
631 if lengthCDS < 0 and abs(lengthCDS) < glen:
632 gstart = newFeatureList[-1][1] + lengthCDS
634 if not useCDS and upstream > 0:
636 if gstart < (gstop - upstream / 2):
637 gstart = gstop - upstream / 2
640 elif not useCDS and downstream > 0:
645 distance = upstream /2
650 nextGene = genome.rightGeneDistance((genomeName, gid), distance * 2)
651 if nextGene < distance * 2:
652 distance = nextGene / 2
654 distance = max(distance, 1)
658 distance = downstream
660 nextGene = genome.leftGeneDistance((genomeName, gid), downstream * 2)
661 if nextGene < downstream * 2:
662 distance = nextGene / 2
664 distance = max(distance, 1)
667 if 0 < lengthCDS < glen:
668 gstart = newFeatureList[-1][-1] - lengthCDS
670 if lengthCDS < 0 and abs(lengthCDS) < glen:
671 gstop = newFeatureList[0][0] - lengthCDS
673 if chrom not in locusByChromDict:
674 locusByChromDict[chrom] = []
677 locusByChromDict[chrom].append((gstart, gstop, gid, glen, sense))
679 locusByChromDict[chrom].append((gstart, gstop, gid, glen))
681 for chrom in locusByChromDict:
682 locusByChromDict[chrom].sort()
684 return locusByChromDict
687 def usingBadLocusParameters(upstream, downstream, useCDS, lengthCDS, upstreamSpanTSS):
688 if upstream == 0 and downstream == 0 and not useCDS:
689 print "getLocusByChromDict: asked for no sequence - returning empty dict"
691 elif upstream > 0 and downstream > 0 and not useCDS:
692 print "getLocusByChromDict: asked for only upstream and downstream - returning empty dict"
694 elif lengthCDS != 0 and not useCDS:
695 print "getLocusByChromDict: asked for partial CDS but not useCDS - returning empty dict"
697 elif upstreamSpanTSS and lengthCDS != 0:
698 print "getLocusByChromDict: asked for TSS spanning and partial CDS - returning empty dict"
700 elif lengthCDS > 0 and downstream > 0:
701 print "getLocusByChromDict: asked for discontinuous partial CDS from start and downstream - returning empty dict"
703 elif lengthCDS < 0 and upstream > 0:
704 print "getLocusByChromDict: asked for discontinuous partial CDS from stop and upstream - returning empty dict"
710 def getGeneFeatures(genome, additionalRegionsDict):
711 featuresDict = genome.getallGeneFeatures()
712 if len(additionalRegionsDict) > 0:
714 for chrom in additionalRegionsDict:
715 for region in additionalRegionsDict[chrom]:
717 if label not in sortList:
718 sortList.append(label)
720 if label not in featuresDict:
721 featuresDict[label] = []
724 sense = featuresDict[label][0][-1]
726 featuresDict[label].append(("custom", chrom, region.start, region.stop, sense))
729 featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
734 def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[],
735 normalizedTag=1., defaultRegionFormat=True, fixedFirstBin=-1,
737 """ returns 2 dictionaries of bin counts and region lengths, given a dictionary of predefined regions,
738 a dictionary of reads, a number of bins, the length of reads, and optionally a list of regions
739 or a different weight / tag.
745 if defaultRegionFormat:
758 print "entering computeRegionBins"
759 if len(regionList) > 0:
760 for readID in regionList:
761 regionsBins[readID] = [0.] * bins
763 for chrom in regionsByChromDict:
764 for regionTuple in regionsByChromDict[chrom]:
765 regionID = regionTuple[regionIDField]
766 regionsBins[regionID] = [0.] * bins
768 for chrom in hitDict:
769 if chrom not in regionsByChromDict:
772 for regionTuple in regionsByChromDict[chrom]:
773 regionID = regionTuple[regionIDField]
774 regionsLen[regionID] = regionTuple[lengthField]
778 for read in hitDict[chrom]:
779 tagStart = read["start"]
780 weight = read["weight"]
782 if index % 100000 == 0:
783 print "read %d " % index,
785 stopPoint = tagStart + readlen
789 for regionTuple in regionsByChromDict[chrom][startRegion:]:
790 start = regionTuple[startField]
791 stop = regionTuple[stopField]
792 regionID = regionTuple[regionIDField]
793 rlen = regionTuple[lengthField]
795 rsense = regionTuple[senseField]
803 if start > stopPoint:
807 if start <= tagStart <= stop:
809 regionBinLength = rlen / bins
811 regionBinLength = binLength
814 positionInRegion = tagStart - start
816 positionInRegion = start + rlen - tagStart
818 # we are relying on python's integer division quirk
819 binID = positionInRegion / regionBinLength
820 if (fixedFirstBin > 0) and (positionInRegion < fixedFirstBin):
822 elif fixedFirstBin > 0:
829 regionsBins[regionID][binID] += normalizedTag * weight
831 print "%s %s" % (regionID, str(binID))
835 return (regionsBins, regionsLen)