first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / commoncode.py
1 import ConfigParser
2 import os
3 import string
4 from time import strftime
5 from array import array
6 from collections import defaultdict
7 import Peak
8 from cistematic.core.geneinfo import geneinfoDB
9 from cistematic.genomes import Genome
10 import Region
11
12 commoncodeVersion = 5.6
13
14 #TODO: This is a function dumping ground - needs to be reworked
15 class ErangeError(Exception):
16     pass
17
18
19 def getReverseComplement(base):
20     revComp = {"A": "T",
21                "T": "A",
22                "G": "C",
23                "C": "G",
24                "N": "N"
25         }
26
27     return revComp[base]
28
29
30 def countDuplicatesInList(listToCheck):
31     tally = defaultdict(int)
32     for item in listToCheck:
33         tally[item] += 1
34
35     return tally.items()
36
37
38 def writeLog(logFile, messenger, message):
39     """ create a log file to write a message from a messenger or append to an existing file.
40     """
41     try:
42         logfile = open(logFile)
43     except IOError:
44         logfile = open(logFile, "w")
45     else:
46         logfile = open(logFile, "a")
47
48     logfile.writelines("%s: [%s] %s\n" % (strftime("%Y-%m-%d %H:%M:%S"), messenger, message))
49     logfile.close()
50
51
52 def getChromInfoList(chrominfo):
53     chromInfoList = []
54     linelist = open(chrominfo)
55     for line in linelist:
56         fields = line.strip().split('\t')
57         chr = fields[0]
58         start = 0
59         end = int(fields[1])
60         chromInfoList.append((chr,start,end))
61
62     linelist.close()
63
64     return chromInfoList
65
66
67 # BAM file support functions
68 def isSpliceEntry(cigarTupleList):
69     isSplice = False
70     if cigarTupleList is not None:
71         for operation,length in cigarTupleList:
72             if operation == 3:
73                 isSplice = True
74                 break
75
76     return isSplice
77
78
79 def addPairIDtoReadID(alignedread):
80     ID = alignedread.qname
81     if alignedread.is_read1:
82         ID = ID + '/1'
83
84     if alignedread.is_read2:
85         ID = ID + '/2'
86
87     return ID
88
89
90 def getHeaderComment(bamHeader, commentKey):
91     for comment in bamHeader["CO"]:
92         fields = comment.split("\t")
93         if fields[0] == commentKey:
94             return fields[1]
95
96     raise KeyError
97
98
99 def getReadSense(read):
100     if read.is_reverse:
101         sense = "-"
102     else:
103         sense = "+"
104
105     return sense
106
107 #TODO: is this where we need to take into account the NH > 10 restriction?
108 def countThisRead(read, countUniqs, countMulti, countSplices, sense):
109     countRead = True
110
111     if sense in ['-', '+'] and sense != getReadSense(read):
112         countRead = False
113     elif not countSplices and isSpliceEntry(read.cigar):
114         countRead = False
115     elif not countUniqs and read.opt('NH') == 1:
116         countRead = False
117     elif not countMulti and read.opt('NH') > 1:
118         countRead = False
119
120     return countRead
121
122
123 # Cistematic functions
124 def getGeneInfoDict(genome, cache=False):
125     idb = geneinfoDB(cache=cache)
126     if genome == "dmelanogaster":
127         geneinfoDict = idb.getallGeneInfo(genome, infoKey="locus")
128     else:
129         geneinfoDict = idb.getallGeneInfo(genome)
130
131     return geneinfoDict
132
133
134 def getGeneAnnotDict(genome, inRAM=False):
135     return getExtendedGeneAnnotDict(genome, "", inRAM=inRAM)
136
137
138 def getExtendedGeneAnnotDict(genomeName, extendGenome, replaceModels=False, inRAM=False):
139     genome = Genome(genomeName, inRAM=inRAM)
140     if extendGenome != "":
141         genome.extendFeatures(extendGenome, replace=replaceModels)
142
143     geneannotDict = genome.allAnnotInfo()
144
145     return geneannotDict
146
147
148 # Configuration File Support
149 def getConfigParser(fileList=[]):
150     configFiles = ["erange.config", os.path.expanduser("~/.erange.config")]
151     for filename in fileList:
152         configFiles.append(filename)
153
154     config = ConfigParser.SafeConfigParser()
155     config.read(configFiles)
156
157     return config
158
159
160 def getConfigOption(parser, section, option, default=None):
161     try:
162         setting = parser.get(section, option)
163     except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
164         setting = default
165
166     return setting
167
168
169 def getConfigIntOption(parser, section, option, default=None):
170     try:
171         setting = parser.getint(section, option)
172     except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
173         setting = default
174
175     return setting
176
177
178 def getConfigFloatOption(parser, section, option, default=None):
179     try:
180         setting = parser.getfloat(section, option)
181     except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
182         setting = default
183
184     return setting
185
186
187 def getConfigBoolOption(parser, section, option, default=None):
188     try:
189         setting = parser.getboolean(section, option)
190     except (ConfigParser.NoSectionError, ConfigParser.NoOptionError, ValueError):
191         setting = default
192
193     return setting
194
195
196 def getAllConfigSectionOptions(parser, section):
197     try:
198         setting = parser.items(section)
199     except ConfigParser.NoSectionError:
200         setting = []
201
202     return setting
203
204
205 # All the Other Stuff from before
206 def getMergedRegions(regionfilename, maxDist=1000, minHits=0, verbose=False, keepLabel=False,
207                      fullChrom=False, chromField=1, scoreField=4, pad=0, compact=False,
208                      doMerge=True, keepPeak=False, returnTop=0):
209
210     """ returns a dictionary containing a list of merged overlapping regions by chromosome; 
211     can optionally filter regions that have a scoreField fewer than minHits.
212     Can also optionally return the label of each region, as well as the
213     peak, if supplied (peakPos and peakHeight should be the last 2 fields).
214     Can return the top regions based on score if higher than minHits.
215     """
216     infile = open(regionfilename)
217     lines = infile.readlines()
218     regions = getMergedRegionsFromList(lines, maxDist, minHits, verbose, keepLabel,
219                                        fullChrom, chromField, scoreField, pad, compact,
220                                        doMerge, keepPeak, returnTop)
221
222     infile.close()
223
224     return regions
225
226
227 def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False, keepLabel=False,
228                      fullChrom = False, chromField=1, scoreField=4, pad=0, compact=False,
229                      doMerge=True, keepPeak=False, returnTop=0):
230     """ returns a dictionary containing a list of merged overlapping regions by chromosome; 
231     can optionally filter regions that have a scoreField fewer than minHits.
232     Can also optionally return the label of each region, as well as the
233     peak, if supplied (peakPos and peakHeight should be the last 2 fields).
234     Can return the top regions based on score if higher than minHits.
235     """
236     regions = {}
237     hasPvalue = 0
238     hasShift = 0
239     if 0 < returnTop < len(regionList):
240         scores = []
241         for regionEntry in regionList:
242             if regionEntry[0] == "#":
243                 if "pvalue" in regionEntry:
244                     hasPvalue = 1
245
246                 if "readShift" in regionEntry:
247                     hasShift = 1
248
249                 continue
250
251             fields = regionEntry.strip().split("\t")
252             hits = float(fields[scoreField].strip())
253             scores.append(hits)
254
255         scores.sort()
256         returnTop = -1 * returnTop 
257         minScore = scores[returnTop]
258         if minScore > minHits:
259             minHits = minScore
260
261     mergeCount = 0
262     chromField = int(chromField)
263     count = 0
264     #TODO: Current algorithm processes input file line by line and compares with prior lines.  Problem is it
265     #      exits at the first merge.  This is not a problem when the input is sorted by start position, but in
266     #      the case of 3 regions ABC that are in the input file as ACB as it goes now when processing C there
267     #      will be no merge with A as B is needed to bridge the two.  When it comes time to process B it will
268     #      be merged with A but that will exit the loop and the merge with C will be missed.
269     #TODO: Are we going to require sorted region files to have this even work?
270     for regionEntry in regionList:
271         if regionEntry[0] == "#":
272             if "pvalue" in regionEntry:
273                 hasPvalue = 1
274
275             if "readShift" in regionEntry:
276                 hasShift = 1
277
278             continue
279
280         fields = regionEntry.strip().split("\t")
281         if minHits >= 0:
282             try:
283                 hits = float(fields[scoreField].strip())
284             except (IndexError, ValueError):
285                 continue
286
287             if hits < minHits:
288                 continue
289
290         if compact:
291             (chrom, pos) = fields[chromField].split(":")
292             (front, back) = pos.split("-")
293             start = int(front)
294             stop = int(back)
295         elif chromField > 1:
296             label = string.join(fields[:chromField],"\t")
297             chrom = fields[chromField]
298             start = int(fields[chromField + 1]) - pad
299             stop = int(fields[chromField + 2]) + pad
300         else:
301             label = fields[0]
302             chrom = fields[1]
303             start = int(fields[2]) - pad
304             stop = int(fields[3]) + pad
305
306         if not fullChrom:
307             chrom = chrom[3:]
308
309         if keepPeak:
310             peakPos = int(fields[-2 - hasPvalue - hasShift])
311             peakHeight = float(fields[-1 - hasPvalue - hasShift])
312
313         if chrom not in regions:
314             regions[chrom] = []
315
316         merged = False
317
318         if doMerge and len(regions[chrom]) > 0:
319             for index in range(len(regions[chrom])):
320                 region = regions[chrom][index]
321                 rstart = region.start
322                 rstop = region.stop
323                 if regionsOverlap(start, stop, rstart, rstop) or regionsAreWithinDistance(start, stop, rstart, rstop, maxDist):
324                     if start < rstart:
325                         rstart = start
326
327                     if rstop < stop:
328                         rstop = stop
329
330                     if keepPeak:
331                         rpeakPos = region.peakPos
332                         rpeakHeight = region.peakHeight
333                         if peakHeight > rpeakHeight:
334                             rpeakHeight = peakHeight
335                             rpeakPos = peakPos
336
337                     regions[chrom][index].start = rstart
338                     regions[chrom][index].stop = rstop
339                     regions[chrom][index].length = abs(rstop - rstart)
340                     if keepLabel:
341                         regions[chrom][index].label = label
342
343                     if keepPeak:
344                         regions[chrom][index].peakPos = rpeakPos
345                         regions[chrom][index].peakHeight = rpeakHeight
346
347
348                     mergeCount += 1
349                     merged = True
350                     break
351
352         if not merged:
353             region = Region.Region(start, stop)
354             if keepLabel:
355                 region.label = label
356
357             if keepPeak:
358                 region.peakPos = peakPos
359                 region.peakHeight = peakHeight
360
361             regions[chrom].append(region)
362             count += 1
363
364         if verbose and (count % 100000 == 0):
365             print count
366
367     regionCount = 0
368     for chrom in regions:
369         regionCount += len(regions[chrom])
370         regions[chrom].sort(cmp=lambda x,y:cmp(x.start, y.start))
371
372     if verbose:
373         print "merged %d times" % mergeCount
374         print "returning %d regions" % regionCount
375
376     return regions
377
378
379 def regionsOverlap(start, stop, rstart, rstop):
380     if start > stop:
381         (start, stop) = (stop, start)
382
383     if rstart > rstop:
384         (rstart, rstop) = (rstop, rstart)
385
386     return (rstart <= start <= rstop) or (rstart <= stop <= rstop) or (start <= rstart <= stop) or (start <= rstop <= stop)
387
388
389 def regionsAreWithinDistance(start, stop, rstart, rstop, maxDist):
390     if start > stop:
391         (start, stop) = (stop, start)
392
393     if rstart > rstop:
394         (rstart, rstop) = (rstop, rstart)
395
396     return (abs(rstart-stop) <= maxDist) or (abs(rstop-start) <= maxDist)
397
398
399 def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False,
400              shift=0, maxshift=75):
401     """ find the peak in a list of reads (hitlist) in a region
402     of a given length and absolute start point. returns a
403     list of peaks, the number of hits, a triangular-smoothed
404     version of hitlist, and the number of reads that are
405     forward (plus) sense.
406     If doWeight is True, weight the reads accordingly.
407     If leftPlus is True, return the number of plus reads left of
408     the peak, taken to be the first TopPos position.
409     """
410
411     if shift == "auto":
412         shift = getBestShiftForRegion(hitList, start, length, useWeight=doWeight, maxShift=maxshift)
413
414     seqArray, regionArray, numHits, numPlus = findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, leftPlus)
415
416     # implementing a triangular smooth
417     smoothArray = array("f", [0.] * length)
418     for pos in range(2,length -2):
419         smoothArray[pos] = (seqArray[pos -2] + 2 * seqArray[pos - 1] + 3 * seqArray[pos] + 2 * seqArray[pos + 1] + seqArray[pos + 2]) / 9.0
420
421     topPos = getPeakPositionList(smoothArray, length)
422     peak = Peak.Peak(topPos, numHits, smoothArray, numPlus, shift=shift)
423
424     if leftPlus:
425         numLeftPlus = 0
426         maxPos = topPos[0]
427         for read in regionArray:
428             if doWeight:
429                 weight = read["weight"]
430             else:
431                 weight = 1.0
432
433             currentPos = read["start"] - start
434             if currentPos <= maxPos and read["sense"] == "+":
435                 numLeftPlus += weight
436
437         peak.numLeftPlus = numLeftPlus
438
439     return peak
440
441
442 def getBestShiftForRegion(readList, start, length, useWeight=False, maxShift=75):
443     bestShift = 0
444     lowestScore = 20000000000
445     for testShift in xrange(maxShift + 1):
446         shiftArray = array("f", [0.] * length)
447         for read in readList:
448             currentpos = read["start"] - start
449             if read["sense"] == "+":
450                 senseModifier = 1
451             else:
452                 senseModifier = -1
453
454             currentpos += testShift * senseModifier
455             if (currentpos < 1) or (currentpos >= length):
456                 continue
457
458             if useWeight:
459                 weight = read["weight"]
460             else:
461                 weight = 1.0
462
463             shiftArray[currentpos] += weight * senseModifier
464
465         currentScore = 0
466         for score in shiftArray:
467             currentScore += abs(score)
468
469         if currentScore < lowestScore:
470             bestShift = testShift
471             lowestScore = currentScore
472
473     return bestShift
474
475
476 def findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, leftPlus):
477     seqArray = array("f", [0.] * length)
478     numHits = 0.
479     numPlus = 0.
480     regionArray = []
481     for read in hitList:
482         currentpos = read["start"] - start
483         if read["sense"] == "+":
484             currentpos += shift
485         else:
486             currentpos -= shift
487
488         if (currentpos <  1 - readlen) or (currentpos >= length):
489             continue
490
491         if doWeight:
492             weight = read["weight"]
493         else:
494             weight = 1.0
495
496         numHits += weight
497         if leftPlus:
498             regionArray.append(read)
499
500         hitIndex = 0
501         while currentpos < 0:
502             hitIndex += 1
503             currentpos += 1
504
505         while hitIndex < readlen and currentpos < length:
506             seqArray[currentpos] += weight
507             hitIndex += 1
508             currentpos += 1
509
510         if read["sense"] == "+":
511             numPlus += weight
512
513     return seqArray, regionArray, numHits, numPlus
514
515
516 def getPeakPositionList(smoothArray, length):
517     topNucleotide = 0
518     peakList = []
519     for currentpos in xrange(length):
520         if topNucleotide < smoothArray[currentpos]:
521             topNucleotide = smoothArray[currentpos]
522             peakList = [currentpos]
523         elif topNucleotide == smoothArray[currentpos]:
524             peakList.append(currentpos)
525
526     return peakList
527
528
529 def getFeaturesByChromDict(genomeObject, additionalRegionsDict={}, ignorePseudo=False,
530                            restrictList=[], regionComplement=False, maxStop=250000000):
531     """ return a dictionary of cistematic gene features. Requires
532     cistematic, obviously. Can filter-out pseudogenes. Will use
533     additional regions dict to supplement gene models, if available.
534     Can restrict output to a list of GIDs.
535     If regionComplement is set to true, returns the regions *outside* of the
536     calculated boundaries, which is useful for retrieving intronic and
537     intergenic regions. maxStop is simply used to define the uppermost
538     boundary of the complement region.
539     """ 
540     featuresDict = genomeObject.getallGeneFeatures()
541     restrictGID = False
542     if len(restrictList) > 0:
543         restrictGID = True
544
545     if len(additionalRegionsDict) > 0:
546         sortList = []
547         for chrom in additionalRegionsDict:
548             for region in additionalRegionsDict[chrom]:
549                 label = region.label
550                 if label not in sortList:
551                     sortList.append(label)
552
553                 if label not in featuresDict:
554                     featuresDict[label] = []
555                     sense = "+"
556                 else:
557                     sense = featuresDict[label][0][-1]
558
559                 featuresDict[label].append(("custom", chrom, region.start, region.stop, sense))
560
561         for gid in sortList:
562             featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
563
564     featuresByChromDict = {}
565     for gid in featuresDict:
566         if restrictGID and gid not in restrictList:
567             continue
568
569         newFeatureList, chrom, sense = getNewFeatureList(featuresDict[gid])
570         if newFeatureList and chrom not in featuresByChromDict:
571             featuresByChromDict[chrom] = []
572
573         for (start, stop, ftype) in newFeatureList:
574             featuresByChromDict[chrom].append((start, stop, gid, sense, ftype))
575
576     for chrom in featuresByChromDict:
577         featuresByChromDict[chrom].sort()
578
579     if regionComplement:
580         complementByChromDict = {}
581         complementIndex = 0
582         for chrom in featuresByChromDict:
583             complementByChromDict[chrom] = []
584             listLength = len(featuresByChromDict[chrom])
585             if listLength > 0:
586                 currentStart = 0
587                 for index in range(listLength):
588                     currentStop = featuresByChromDict[chrom][index][0]
589                     complementIndex += 1
590                     if currentStart < currentStop:
591                         complementByChromDict[chrom].append((currentStart, currentStop, "nonExon%d" % complementIndex, "F", "nonExon"))
592
593                     currentStart = featuresByChromDict[chrom][index][1]
594
595                 currentStop = maxStop
596                 complementByChromDict[chrom].append((currentStart, currentStop, "nonExon%d" % complementIndex, "F", "nonExon"))
597
598         return (featuresByChromDict, complementByChromDict)
599     else:
600         return featuresByChromDict
601
602
603 def getNewFeatureList(featureList, ignorePseudo=False):
604     newFeatureList = []
605     for (featureType, chrom, start, stop, sense) in featureList:
606         if featureType == "PSEUDO" and ignorePseudo:
607             return [], chrom, sense
608
609         if (start, stop, featureType) not in newFeatureList:
610             notContained = True
611             containedList = []
612             for (newFeatureStart, newFeatureStop, featureType2) in newFeatureList:
613                 if start >= newFeatureStart and stop <= newFeatureStop:
614                     notContained = False
615
616                 if start < newFeatureStart and stop > newFeatureStop:
617                     containedList.append((newFeatureStart, newFeatureStop))
618
619             if len(containedList) > 0:
620                 newFList = []
621                 notContained = True
622                 for (newFeatureStart, newFeatureStop, featureType2) in newFeatureList:
623                     if (newFeatureStart, newFeatureStop) not in containedList:
624                         newFList.append((newFeatureStart, newFeatureStop, featureType2))
625                         if start >= newFeatureStart and stop <= newFeatureStop:
626                             notContained = False
627
628                 newFeatureList = newFList
629
630             if notContained:
631                 newFeatureList.append((start, stop, featureType))
632
633     return newFeatureList, chrom, sense
634
635
636 def getLocusByChromDict(genome, upstream=0, downstream=0, useCDS=True,
637                         additionalRegionsDict={}, ignorePseudo=False, upstreamSpanTSS=False,
638                         lengthCDS=0, keepSense=False, adjustToNeighbor=True):
639     """ return a dictionary of gene loci. Can be used to retrieve additional
640     sequence upstream or downstream of gene, up to the next gene. Requires
641     cistematic, obviously.
642     Can filter-out pseudogenes and use additional regions outside of existing
643     gene models. Use upstreamSpanTSS to overlap half of the upstream region
644     over the TSS.
645     If lengthCDS > 0 bp, e.g. X, return only the starting X bp from CDS. If
646     lengthCDS < 0bp, return only the last X bp from CDS.
647     """ 
648     locusByChromDict = {}
649     if usingBadLocusParameters(upstream, downstream, useCDS, lengthCDS, upstreamSpanTSS):
650         return locusByChromDict
651
652     genomeName = genome.genome
653     featuresDict = getGeneFeatures(genome, additionalRegionsDict)
654     for gid in featuresDict:
655         featureList = featuresDict[gid]
656         newFeatureList = []
657         for (ftype, chrom, start, stop, sense) in featureList:
658             newFeatureList.append((start, stop))
659
660         if ignorePseudo and ftype == "PSEUDO":
661             continue
662
663         newFeatureList.sort()
664
665         sense = featureList[0][-1]
666         gstart = newFeatureList[0][0]
667         gstop = newFeatureList[-1][1]
668         glen = abs(gstart - gstop)
669         if sense == "F":
670             if not useCDS and upstream > 0:
671                 if upstreamSpanTSS:
672                     if gstop > (gstart + upstream / 2):
673                         gstop = gstart + upstream / 2
674                 else:
675                     gstop = gstart
676             elif not useCDS and downstream > 0:
677                 gstart = gstop
678
679             if upstream > 0:
680                 if upstreamSpanTSS:
681                     distance = upstream / 2
682                 else:
683                     distance = upstream
684
685                 if adjustToNeighbor:
686                     nextGene = genome.leftGeneDistance((genomeName, gid), distance * 2)
687                     if nextGene < distance * 2:
688                         distance = nextGene / 2
689
690                 distance = max(distance, 1)
691                 gstart -= distance
692
693             if downstream > 0:
694                 distance = downstream
695                 if adjustToNeighbor:
696                     nextGene = genome.rightGeneDistance((genomeName, gid), downstream * 2)
697                     if nextGene < downstream * 2:
698                         distance = nextGene / 2
699
700                 distance = max(distance, 1)
701                 gstop += distance
702
703             if 0 < lengthCDS < glen:
704                 gstop = newFeatureList[0][0] + lengthCDS
705
706             if lengthCDS < 0 and abs(lengthCDS) < glen:
707                 gstart = newFeatureList[-1][1] + lengthCDS
708         else:
709             if not useCDS and upstream > 0:
710                 if upstreamSpanTSS:
711                     if gstart < (gstop - upstream / 2):
712                         gstart = gstop - upstream / 2
713                 else:
714                     gstart = gstop
715             elif not useCDS and downstream > 0:
716                     gstop = gstart
717
718             if upstream > 0:
719                 if upstreamSpanTSS:
720                     distance = upstream /2
721                 else:
722                     distance = upstream
723
724                 if adjustToNeighbor:
725                     nextGene = genome.rightGeneDistance((genomeName, gid), distance * 2)
726                     if nextGene < distance * 2:
727                         distance = nextGene / 2
728
729                 distance = max(distance, 1)
730                 gstop += distance
731
732             if downstream > 0:
733                 distance = downstream
734                 if adjustToNeighbor:
735                     nextGene = genome.leftGeneDistance((genomeName, gid), downstream * 2)
736                     if nextGene < downstream * 2:
737                         distance = nextGene / 2
738
739                 distance = max(distance, 1)
740                 gstart -= distance
741
742             if 0 < lengthCDS < glen:
743                 gstart = newFeatureList[-1][-1] - lengthCDS
744
745             if lengthCDS < 0 and abs(lengthCDS) < glen:
746                 gstop = newFeatureList[0][0] - lengthCDS
747
748         if chrom not in locusByChromDict:
749             locusByChromDict[chrom] = []
750
751         if keepSense:
752             locusByChromDict[chrom].append((gstart, gstop, gid, glen, sense))
753         else:
754             locusByChromDict[chrom].append((gstart, gstop, gid, glen))
755
756     for chrom in locusByChromDict:
757         locusByChromDict[chrom].sort()
758
759     return locusByChromDict
760
761
762 def usingBadLocusParameters(upstream, downstream, useCDS, lengthCDS, upstreamSpanTSS):
763     if upstream == 0 and downstream == 0 and not useCDS:
764         print "getLocusByChromDict: asked for no sequence - returning empty dict"
765         return True
766     elif upstream > 0 and downstream > 0 and not useCDS:
767         print "getLocusByChromDict: asked for only upstream and downstream - returning empty dict"
768         return True
769     elif lengthCDS != 0 and not useCDS:
770         print "getLocusByChromDict: asked for partial CDS but not useCDS - returning empty dict"
771         return True
772     elif upstreamSpanTSS and lengthCDS != 0:
773         print "getLocusByChromDict: asked for TSS spanning and partial CDS - returning empty dict"
774         return True
775     elif lengthCDS > 0 and downstream > 0:
776         print "getLocusByChromDict: asked for discontinuous partial CDS from start and downstream - returning empty dict"
777         return True
778     elif lengthCDS < 0 and upstream > 0:
779         print "getLocusByChromDict: asked for discontinuous partial CDS from stop and upstream - returning empty dict"
780         return True
781
782     return False
783
784
785 def getGeneFeatures(genome, additionalRegionsDict):
786     featuresDict = genome.getallGeneFeatures()
787     if len(additionalRegionsDict) > 0:
788         sortList = []
789         for chrom in additionalRegionsDict:
790             for region in additionalRegionsDict[chrom]:
791                 label = region.label
792                 if label not in sortList:
793                     sortList.append(label)
794
795                 if label not in featuresDict:
796                     featuresDict[label] = []
797                     sense = "+"
798                 else:
799                     sense = featuresDict[label][0][-1]
800
801                 featuresDict[label].append(("custom", chrom, region.start, region.stop, sense))
802
803         for gid in sortList:
804             featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
805
806     return featuresDict
807
808
809 def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[],
810                       normalizedTag=1., defaultRegionFormat=True, fixedFirstBin=-1,
811                       binLength=-1):
812     """ returns 2 dictionaries of bin counts and region lengths, given a dictionary of predefined regions,
813         a dictionary of reads, a number of bins, the length of reads, and optionally a list of regions
814         or a different weight / tag.
815     """
816     index = 0
817     regionsBins = {}
818     regionsLen = {}
819
820     if defaultRegionFormat:
821         regionIDField = 0
822         startField = 1
823         stopField = 2
824         lengthField = 3
825     else:
826         startField = 0
827         stopField = 1
828         regionIDField = 2
829         lengthField = 3
830
831     senseField = 4
832
833     print "entering computeRegionBins"
834     if len(regionList) > 0:
835         for readID in regionList:
836             regionsBins[readID] = [0.] * bins
837     else:
838         for chrom in regionsByChromDict:
839             for regionTuple in regionsByChromDict[chrom]:
840                 regionID = regionTuple[regionIDField]
841                 regionsBins[regionID] = [0.] * bins
842
843     for chrom in hitDict:
844         if chrom not in regionsByChromDict:
845             continue
846
847         for regionTuple in regionsByChromDict[chrom]:
848             regionID = regionTuple[regionIDField]
849             regionsLen[regionID] = regionTuple[lengthField]
850
851         print "%s\n" % chrom
852         startRegion = 0
853         for read in hitDict[chrom]:
854             tagStart = read["start"]
855             weight = read["weight"]
856             index += 1
857             if index % 100000 == 0:
858                 print "read %d " % index,
859
860             stopPoint = tagStart + readlen
861             if startRegion < 0:
862                 startRegion = 0
863
864             for regionTuple in regionsByChromDict[chrom][startRegion:]:
865                 start = regionTuple[startField]
866                 stop = regionTuple[stopField]
867                 regionID = regionTuple[regionIDField]
868                 rlen = regionTuple[lengthField]
869                 try:
870                     rsense = regionTuple[senseField]
871                 except IndexError:
872                     rsense = "F"
873
874                 if tagStart > stop:
875                     startRegion += 1
876                     continue
877
878                 if start > stopPoint:
879                     startRegion -= 10
880                     break
881
882                 if start <= tagStart <= stop:
883                     if binLength < 1:
884                         regionBinLength = rlen / bins
885                     else:
886                         regionBinLength = binLength
887
888                     if rsense == "F":
889                         positionInRegion = tagStart - start
890                     else:
891                         positionInRegion = start + rlen - tagStart
892
893                     # we are relying on python's integer division quirk
894                     binID = positionInRegion / regionBinLength
895                     if (fixedFirstBin > 0) and (positionInRegion < fixedFirstBin):
896                         binID = 0
897                     elif fixedFirstBin > 0:
898                         binID = 1
899
900                     if binID >= bins:
901                         binID = bins - 1
902
903                     try:
904                         regionsBins[regionID][binID] += normalizedTag * weight
905                     except KeyError:
906                         print "%s %s" % (regionID, str(binID))
907
908                     stopPoint = stop
909
910     return (regionsBins, regionsLen)