rewrite of findall.py and MakeRdsFromBam to fix bugs resulting from poor initial...
[erange.git] / commoncode.py
1 import ConfigParser
2 import os
3 import string
4 from time import strftime
5 from array import array
6 from collections import defaultdict
7 import Peak
8 from cistematic.core.geneinfo import geneinfoDB
9 from cistematic.genomes import Genome
10 import Region
11
12 commoncodeVersion = 5.6
13 currentRDSversion = 2.0
14
15 class ErangeError(Exception):
16     pass
17
18
19 def getReverseComplement(base):
20     revComp = {"A": "T",
21                "T": "A",
22                "G": "C",
23                "C": "G",
24                "N": "N"
25         }
26
27     return revComp[base]
28
29
30 def countDuplicatesInList(listToCheck):
31     tally = defaultdict(int)
32     for item in listToCheck:
33         tally[item] += 1
34
35     return tally.items()
36
37
38 def writeLog(logFile, messenger, message):
39     """ create a log file to write a message from a messenger or append to an existing file.
40     """
41     try:
42         logfile = open(logFile)
43     except IOError:
44         logfile = open(logFile, "w")
45     else:
46         logfile = open(logFile, "a")
47
48     logfile.writelines("%s: [%s] %s\n" % (strftime("%Y-%m-%d %H:%M:%S"), messenger, message))
49     logfile.close()
50
51
52 def getGeneInfoDict(genome, cache=False):
53     idb = geneinfoDB(cache=cache)
54     if genome == "dmelanogaster":
55         geneinfoDict = idb.getallGeneInfo(genome, infoKey="locus")
56     else:
57         geneinfoDict = idb.getallGeneInfo(genome)
58
59     return geneinfoDict
60
61
62 def getGeneAnnotDict(genome, inRAM=False):
63     return getExtendedGeneAnnotDict(genome, "", inRAM=inRAM)
64
65
66 def getExtendedGeneAnnotDict(genomeName, extendGenome, replaceModels=False, inRAM=False):
67     genome = Genome(genomeName, inRAM=inRAM)
68     if extendGenome != "":
69         genome.extendFeatures(extendGenome, replace=replaceModels)
70
71     geneannotDict = genome.allAnnotInfo()
72
73     return geneannotDict
74
75
76 def getConfigParser(fileList=[]):
77     configFiles = ["erange.config", os.path.expanduser("~/.erange.config")]
78     for filename in fileList:
79         configFiles.append(filename)
80
81     config = ConfigParser.SafeConfigParser()
82     config.read(configFiles)
83
84     return config
85
86
87 def getConfigOption(parser, section, option, default=None):
88     try:
89         setting = parser.get(section, option)
90     except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
91         setting = default
92
93     return setting
94
95
96 def getConfigIntOption(parser, section, option, default=None):
97     try:
98         setting = parser.getint(section, option)
99     except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
100         setting = default
101
102     return setting
103
104
105 def getConfigFloatOption(parser, section, option, default=None):
106     try:
107         setting = parser.getfloat(section, option)
108     except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
109         setting = default
110
111     return setting
112
113
114 def getConfigBoolOption(parser, section, option, default=None):
115     try:
116         setting = parser.getboolean(section, option)
117     except (ConfigParser.NoSectionError, ConfigParser.NoOptionError, ValueError):
118         setting = default
119
120     return setting
121
122
123 def getAllConfigSectionOptions(parser, section):
124     try:
125         setting = parser.items(section)
126     except ConfigParser.NoSectionError:
127         setting = []
128
129     return setting
130
131
132 def getMergedRegions(regionfilename, maxDist=1000, minHits=0, verbose=False, keepLabel=False,
133                      fullChrom=False, chromField=1, scoreField=4, pad=0, compact=False,
134                      doMerge=True, keepPeak=False, returnTop=0):
135
136     """ returns a dictionary containing a list of merged overlapping regions by chromosome; 
137     can optionally filter regions that have a scoreField fewer than minHits.
138     Can also optionally return the label of each region, as well as the
139     peak, if supplied (peakPos and peakHeight should be the last 2 fields).
140     Can return the top regions based on score if higher than minHits.
141     """
142     infile = open(regionfilename)
143     lines = infile.readlines()
144     regions = getMergedRegionsFromList(lines, maxDist, minHits, verbose, keepLabel,
145                                        fullChrom, chromField, scoreField, pad, compact,
146                                        doMerge, keepPeak, returnTop)
147
148     infile.close()
149
150     return regions
151
152
153 def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False, keepLabel=False,
154                      fullChrom = False, chromField=1, scoreField=4, pad=0, compact=False,
155                      doMerge=True, keepPeak=False, returnTop=0):
156     """ returns a dictionary containing a list of merged overlapping regions by chromosome; 
157     can optionally filter regions that have a scoreField fewer than minHits.
158     Can also optionally return the label of each region, as well as the
159     peak, if supplied (peakPos and peakHeight should be the last 2 fields).
160     Can return the top regions based on score if higher than minHits.
161     """
162     regions = {}
163     hasPvalue = 0
164     hasShift = 0
165     if 0 < returnTop < len(regionList):
166         scores = []
167         for regionEntry in regionList:
168             if regionEntry[0] == "#":
169                 if "pvalue" in regionEntry:
170                     hasPvalue = 1
171
172                 if "readShift" in regionEntry:
173                     hasShift = 1
174
175                 continue
176
177             fields = regionEntry.strip().split("\t")
178             hits = float(fields[scoreField].strip())
179             scores.append(hits)
180
181         scores.sort()
182         returnTop = -1 * returnTop 
183         minScore = scores[returnTop]
184         if minScore > minHits:
185             minHits = minScore
186
187     mergeCount = 0
188     chromField = int(chromField)
189     count = 0
190     #TODO: Current algorithm processes input file line by line and compares with prior lines.  Problem is it
191     #      exits at the first merge.  This is not a problem when the input is sorted by start position, but in
192     #      the case of 3 regions ABC that are in the input file as ACB as it goes now when processing C there
193     #      will be no merge with A as B is needed to bridge the two.  When it comes time to process B it will
194     #      be merged with A but that will exit the loop and the merge with C will be missed.
195     for regionEntry in regionList:
196         if regionEntry[0] == "#":
197             if "pvalue" in regionEntry:
198                 hasPvalue = 1
199
200             if "readShift" in regionEntry:
201                 hasShift = 1
202
203             continue
204
205         fields = regionEntry.strip().split("\t")
206         if minHits >= 0:
207             try:
208                 hits = float(fields[scoreField].strip())
209             except (IndexError, ValueError):
210                 continue
211
212             if hits < minHits:
213                 continue
214
215         if compact:
216             (chrom, pos) = fields[chromField].split(":")
217             (front, back) = pos.split("-")
218             start = int(front)
219             stop = int(back)
220         elif chromField > 1:
221             label = string.join(fields[:chromField],"\t")
222             chrom = fields[chromField]
223             start = int(fields[chromField + 1]) - pad
224             stop = int(fields[chromField + 2]) + pad
225         else:
226             label = fields[0]
227             chrom = fields[1]
228             start = int(fields[2]) - pad
229             stop = int(fields[3]) + pad
230
231         if not fullChrom:
232             chrom = chrom[3:]
233
234         if keepPeak:
235             peakPos = int(fields[-2 - hasPvalue - hasShift])
236             peakHeight = float(fields[-1 - hasPvalue - hasShift])
237
238         if chrom not in regions:
239             regions[chrom] = []
240
241         merged = False
242
243         if doMerge and len(regions[chrom]) > 0:
244             for index in range(len(regions[chrom])):
245                 region = regions[chrom][index]
246                 rstart = region.start
247                 rstop = region.stop
248                 if regionsOverlap(start, stop, rstart, rstop) or regionsAreWithinDistance(start, stop, rstart, rstop, maxDist):
249                     if start < rstart:
250                         rstart = start
251
252                     if rstop < stop:
253                         rstop = stop
254
255                     if keepPeak:
256                         rpeakPos = region.peakPos
257                         rpeakHeight = region.peakHeight
258                         if peakHeight > rpeakHeight:
259                             rpeakHeight = peakHeight
260                             rpeakPos = peakPos
261
262                     regions[chrom][index].start = rstart
263                     regions[chrom][index].stop = rstop
264                     regions[chrom][index].length = abs(rstop - rstart)
265                     if keepLabel:
266                         regions[chrom][index].label = label
267
268                     if keepPeak:
269                         regions[chrom][index].peakPos = rpeakPos
270                         regions[chrom][index].peakHeight = rpeakHeight
271
272
273                     mergeCount += 1
274                     merged = True
275                     break
276
277         if not merged:
278             region = Region.Region(start, stop)
279             if keepLabel:
280                 region.label = label
281
282             if keepPeak:
283                 region.peakPos = peakPos
284                 region.peakHeight = peakHeight
285
286             regions[chrom].append(region)
287             count += 1
288
289         if verbose and (count % 100000 == 0):
290             print count
291
292     regionCount = 0
293     for chrom in regions:
294         regionCount += len(regions[chrom])
295         regions[chrom].sort(cmp=lambda x,y:cmp(x.start, y.start))
296
297     if verbose:
298         print "merged %d times" % mergeCount
299         print "returning %d regions" % regionCount
300
301     return regions
302
303
304 def regionsOverlap(start, stop, rstart, rstop):
305     if start > stop:
306         (start, stop) = (stop, start)
307
308     if rstart > rstop:
309         (rstart, rstop) = (rstop, rstart)
310
311     return (rstart <= start <= rstop) or (rstart <= stop <= rstop) or (start <= rstart <= stop) or (start <= rstop <= stop)
312
313
314 def regionsAreWithinDistance(start, stop, rstart, rstop, maxDist):
315     if start > stop:
316         (start, stop) = (stop, start)
317
318     if rstart > rstop:
319         (rstart, rstop) = (rstop, rstart)
320
321     return (abs(rstart-stop) <= maxDist) or (abs(rstop-start) <= maxDist)
322
323
324 def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False,
325              shift=0, maxshift=75):
326     """ find the peak in a list of reads (hitlist) in a region
327     of a given length and absolute start point. returns a
328     list of peaks, the number of hits, a triangular-smoothed
329     version of hitlist, and the number of reads that are
330     forward (plus) sense.
331     If doWeight is True, weight the reads accordingly.
332     If leftPlus is True, return the number of plus reads left of
333     the peak, taken to be the first TopPos position.
334     """
335
336     if shift == "auto":
337         shift = getBestShiftForRegion(hitList, start, length, useWeight=doWeight, maxShift=maxshift)
338
339     seqArray, regionArray, numHits, numPlus = findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, leftPlus)
340
341     # implementing a triangular smooth
342     smoothArray = array("f", [0.] * length)
343     for pos in range(2,length -2):
344         smoothArray[pos] = (seqArray[pos -2] + 2 * seqArray[pos - 1] + 3 * seqArray[pos] + 2 * seqArray[pos + 1] + seqArray[pos + 2]) / 9.0
345
346     topPos = getPeakPositionList(smoothArray, length)
347     peak = Peak.Peak(topPos, numHits, smoothArray, numPlus, shift=shift)
348
349     if leftPlus:
350         numLeftPlus = 0
351         maxPos = topPos[0]
352         for read in regionArray:
353             if doWeight:
354                 weight = read["weight"]
355             else:
356                 weight = 1.0
357
358             currentPos = read["start"] - start
359             if currentPos <= maxPos and read["sense"] == "+":
360                 numLeftPlus += weight
361
362         peak.numLeftPlus = numLeftPlus
363
364     return peak
365
366
367 def getBestShiftForRegion(readList, start, length, useWeight=False, maxShift=75):
368     bestShift = 0
369     lowestScore = 20000000000
370     for testShift in xrange(maxShift + 1):
371         shiftArray = array("f", [0.] * length)
372         for read in readList:
373             currentpos = read["start"] - start
374             if read["sense"] == "+":
375                 senseModifier = 1
376             else:
377                 senseModifier = -1
378
379             currentpos += testShift * senseModifier
380             if (currentpos < 1) or (currentpos >= length):
381                 continue
382
383             if useWeight:
384                 weight = read["weight"]
385             else:
386                 weight = 1.0
387
388             shiftArray[currentpos] += weight * senseModifier
389
390         currentScore = 0
391         for score in shiftArray:
392             currentScore += abs(score)
393
394         if currentScore < lowestScore:
395             bestShift = testShift
396             lowestScore = currentScore
397
398     return bestShift
399
400
401 def findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, leftPlus):
402     seqArray = array("f", [0.] * length)
403     numHits = 0.
404     numPlus = 0.
405     regionArray = []
406     for read in hitList:
407         currentpos = read["start"] - start
408         if read["sense"] == "+":
409             currentpos += shift
410         else:
411             currentpos -= shift
412
413         if (currentpos <  1 - readlen) or (currentpos >= length):
414             continue
415
416         if doWeight:
417             weight = read["weight"]
418         else:
419             weight = 1.0
420
421         numHits += weight
422         if leftPlus:
423             regionArray.append(read)
424
425         hitIndex = 0
426         while currentpos < 0:
427             hitIndex += 1
428             currentpos += 1
429
430         while hitIndex < readlen and currentpos < length:
431             seqArray[currentpos] += weight
432             hitIndex += 1
433             currentpos += 1
434
435         if read["sense"] == "+":
436             numPlus += weight
437
438     return seqArray, regionArray, numHits, numPlus
439
440
441 def getPeakPositionList(smoothArray, length):
442     topNucleotide = 0
443     peakList = []
444     for currentpos in xrange(length):
445         if topNucleotide < smoothArray[currentpos]:
446             topNucleotide = smoothArray[currentpos]
447             peakList = [currentpos]
448         elif topNucleotide == smoothArray[currentpos]:
449             peakList.append(currentpos)
450
451     return peakList
452
453
454 def getFeaturesByChromDict(genomeObject, additionalRegionsDict={}, ignorePseudo=False,
455                            restrictList=[], regionComplement=False, maxStop=250000000):
456     """ return a dictionary of cistematic gene features. Requires
457     cistematic, obviously. Can filter-out pseudogenes. Will use
458     additional regions dict to supplement gene models, if available.
459     Can restrict output to a list of GIDs.
460     If regionComplement is set to true, returns the regions *outside* of the
461     calculated boundaries, which is useful for retrieving intronic and
462     intergenic regions. maxStop is simply used to define the uppermost
463     boundary of the complement region.
464     """ 
465     featuresDict = genomeObject.getallGeneFeatures()
466     restrictGID = False
467     if len(restrictList) > 0:
468         restrictGID = True
469
470     if len(additionalRegionsDict) > 0:
471         sortList = []
472         for chrom in additionalRegionsDict:
473             for region in additionalRegionsDict[chrom]:
474                 label = region.label
475                 if label not in sortList:
476                     sortList.append(label)
477
478                 if label not in featuresDict:
479                     featuresDict[label] = []
480                     sense = "+"
481                 else:
482                     sense = featuresDict[label][0][-1]
483
484                 featuresDict[label].append(("custom", chrom, region.start, region.stop, sense))
485
486         for gid in sortList:
487             featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
488
489     featuresByChromDict = {}
490     for gid in featuresDict:
491         if restrictGID and gid not in restrictList:
492             continue
493
494         newFeatureList, chrom, sense = getNewFeatureList(featuresDict[gid])
495         if newFeatureList and chrom not in featuresByChromDict:
496             featuresByChromDict[chrom] = []
497
498         for (start, stop, ftype) in newFeatureList:
499             featuresByChromDict[chrom].append((start, stop, gid, sense, ftype))
500
501     for chrom in featuresByChromDict:
502         featuresByChromDict[chrom].sort()
503
504     if regionComplement:
505         complementByChromDict = {}
506         complementIndex = 0
507         for chrom in featuresByChromDict:
508             complementByChromDict[chrom] = []
509             listLength = len(featuresByChromDict[chrom])
510             if listLength > 0:
511                 currentStart = 0
512                 for index in range(listLength):
513                     currentStop = featuresByChromDict[chrom][index][0]
514                     complementIndex += 1
515                     if currentStart < currentStop:
516                         complementByChromDict[chrom].append((currentStart, currentStop, "nonExon%d" % complementIndex, "F", "nonExon"))
517
518                     currentStart = featuresByChromDict[chrom][index][1]
519
520                 currentStop = maxStop
521                 complementByChromDict[chrom].append((currentStart, currentStop, "nonExon%d" % complementIndex, "F", "nonExon"))
522
523         return (featuresByChromDict, complementByChromDict)
524     else:
525         return featuresByChromDict
526
527
528 def getNewFeatureList(featureList, ignorePseudo=False):
529     newFeatureList = []
530     for (featureType, chrom, start, stop, sense) in featureList:
531         if featureType == "PSEUDO" and ignorePseudo:
532             return [], chrom, sense
533
534         if (start, stop, featureType) not in newFeatureList:
535             notContained = True
536             containedList = []
537             for (newFeatureStart, newFeatureStop, featureType2) in newFeatureList:
538                 if start >= newFeatureStart and stop <= newFeatureStop:
539                     notContained = False
540
541                 if start < newFeatureStart and stop > newFeatureStop:
542                     containedList.append((newFeatureStart, newFeatureStop))
543
544             if len(containedList) > 0:
545                 newFList = []
546                 notContained = True
547                 for (newFeatureStart, newFeatureStop, featureType2) in newFeatureList:
548                     if (newFeatureStart, newFeatureStop) not in containedList:
549                         newFList.append((newFeatureStart, newFeatureStop, featureType2))
550                         if start >= newFeatureStart and stop <= newFeatureStop:
551                             notContained = False
552
553                 newFeatureList = newFList
554
555             if notContained:
556                 newFeatureList.append((start, stop, featureType))
557
558     return newFeatureList, chrom, sense
559
560
561 def getLocusByChromDict(genome, upstream=0, downstream=0, useCDS=True,
562                         additionalRegionsDict={}, ignorePseudo=False, upstreamSpanTSS=False,
563                         lengthCDS=0, keepSense=False, adjustToNeighbor=True):
564     """ return a dictionary of gene loci. Can be used to retrieve additional
565     sequence upstream or downstream of gene, up to the next gene. Requires
566     cistematic, obviously.
567     Can filter-out pseudogenes and use additional regions outside of existing
568     gene models. Use upstreamSpanTSS to overlap half of the upstream region
569     over the TSS.
570     If lengthCDS > 0 bp, e.g. X, return only the starting X bp from CDS. If
571     lengthCDS < 0bp, return only the last X bp from CDS.
572     """ 
573     locusByChromDict = {}
574     if usingBadLocusParameters(upstream, downstream, useCDS, lengthCDS, upstreamSpanTSS):
575         return locusByChromDict
576
577     genomeName = genome.genome
578     featuresDict = getGeneFeatures(genome, additionalRegionsDict)
579     for gid in featuresDict:
580         featureList = featuresDict[gid]
581         newFeatureList = []
582         for (ftype, chrom, start, stop, sense) in featureList:
583             newFeatureList.append((start, stop))
584
585         if ignorePseudo and ftype == "PSEUDO":
586             continue
587
588         newFeatureList.sort()
589
590         sense = featureList[0][-1]
591         gstart = newFeatureList[0][0]
592         gstop = newFeatureList[-1][1]
593         glen = abs(gstart - gstop)
594         if sense == "F":
595             if not useCDS and upstream > 0:
596                 if upstreamSpanTSS:
597                     if gstop > (gstart + upstream / 2):
598                         gstop = gstart + upstream / 2
599                 else:
600                     gstop = gstart
601             elif not useCDS and downstream > 0:
602                 gstart = gstop
603
604             if upstream > 0:
605                 if upstreamSpanTSS:
606                     distance = upstream / 2
607                 else:
608                     distance = upstream
609
610                 if adjustToNeighbor:
611                     nextGene = genome.leftGeneDistance((genomeName, gid), distance * 2)
612                     if nextGene < distance * 2:
613                         distance = nextGene / 2
614
615                 distance = max(distance, 1)
616                 gstart -= distance
617
618             if downstream > 0:
619                 distance = downstream
620                 if adjustToNeighbor:
621                     nextGene = genome.rightGeneDistance((genomeName, gid), downstream * 2)
622                     if nextGene < downstream * 2:
623                         distance = nextGene / 2
624
625                 distance = max(distance, 1)
626                 gstop += distance
627
628             if 0 < lengthCDS < glen:
629                 gstop = newFeatureList[0][0] + lengthCDS
630
631             if lengthCDS < 0 and abs(lengthCDS) < glen:
632                 gstart = newFeatureList[-1][1] + lengthCDS
633         else:
634             if not useCDS and upstream > 0:
635                 if upstreamSpanTSS:
636                     if gstart < (gstop - upstream / 2):
637                         gstart = gstop - upstream / 2
638                 else:
639                     gstart = gstop
640             elif not useCDS and downstream > 0:
641                     gstop = gstart
642
643             if upstream > 0:
644                 if upstreamSpanTSS:
645                     distance = upstream /2
646                 else:
647                     distance = upstream
648
649                 if adjustToNeighbor:
650                     nextGene = genome.rightGeneDistance((genomeName, gid), distance * 2)
651                     if nextGene < distance * 2:
652                         distance = nextGene / 2
653
654                 distance = max(distance, 1)
655                 gstop += distance
656
657             if downstream > 0:
658                 distance = downstream
659                 if adjustToNeighbor:
660                     nextGene = genome.leftGeneDistance((genomeName, gid), downstream * 2)
661                     if nextGene < downstream * 2:
662                         distance = nextGene / 2
663
664                 distance = max(distance, 1)
665                 gstart -= distance
666
667             if 0 < lengthCDS < glen:
668                 gstart = newFeatureList[-1][-1] - lengthCDS
669
670             if lengthCDS < 0 and abs(lengthCDS) < glen:
671                 gstop = newFeatureList[0][0] - lengthCDS
672
673         if chrom not in locusByChromDict:
674             locusByChromDict[chrom] = []
675
676         if keepSense:
677             locusByChromDict[chrom].append((gstart, gstop, gid, glen, sense))
678         else:
679             locusByChromDict[chrom].append((gstart, gstop, gid, glen))
680
681     for chrom in locusByChromDict:
682         locusByChromDict[chrom].sort()
683
684     return locusByChromDict
685
686
687 def usingBadLocusParameters(upstream, downstream, useCDS, lengthCDS, upstreamSpanTSS):
688     if upstream == 0 and downstream == 0 and not useCDS:
689         print "getLocusByChromDict: asked for no sequence - returning empty dict"
690         return True
691     elif upstream > 0 and downstream > 0 and not useCDS:
692         print "getLocusByChromDict: asked for only upstream and downstream - returning empty dict"
693         return True
694     elif lengthCDS != 0 and not useCDS:
695         print "getLocusByChromDict: asked for partial CDS but not useCDS - returning empty dict"
696         return True
697     elif upstreamSpanTSS and lengthCDS != 0:
698         print "getLocusByChromDict: asked for TSS spanning and partial CDS - returning empty dict"
699         return True
700     elif lengthCDS > 0 and downstream > 0:
701         print "getLocusByChromDict: asked for discontinuous partial CDS from start and downstream - returning empty dict"
702         return True
703     elif lengthCDS < 0 and upstream > 0:
704         print "getLocusByChromDict: asked for discontinuous partial CDS from stop and upstream - returning empty dict"
705         return True
706
707     return False
708
709
710 def getGeneFeatures(genome, additionalRegionsDict):
711     featuresDict = genome.getallGeneFeatures()
712     if len(additionalRegionsDict) > 0:
713         sortList = []
714         for chrom in additionalRegionsDict:
715             for region in additionalRegionsDict[chrom]:
716                 label = region.label
717                 if label not in sortList:
718                     sortList.append(label)
719
720                 if label not in featuresDict:
721                     featuresDict[label] = []
722                     sense = "+"
723                 else:
724                     sense = featuresDict[label][0][-1]
725
726                 featuresDict[label].append(("custom", chrom, region.start, region.stop, sense))
727
728         for gid in sortList:
729             featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
730
731     return featuresDict
732
733
734 def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[],
735                       normalizedTag=1., defaultRegionFormat=True, fixedFirstBin=-1,
736                       binLength=-1):
737     """ returns 2 dictionaries of bin counts and region lengths, given a dictionary of predefined regions,
738         a dictionary of reads, a number of bins, the length of reads, and optionally a list of regions
739         or a different weight / tag.
740     """
741     index = 0
742     regionsBins = {}
743     regionsLen = {}
744
745     if defaultRegionFormat:
746         regionIDField = 0
747         startField = 1
748         stopField = 2
749         lengthField = 3
750     else:
751         startField = 0
752         stopField = 1
753         regionIDField = 2
754         lengthField = 3
755
756     senseField = 4
757
758     print "entering computeRegionBins"
759     if len(regionList) > 0:
760         for readID in regionList:
761             regionsBins[readID] = [0.] * bins
762     else:
763         for chrom in regionsByChromDict:
764             for regionTuple in regionsByChromDict[chrom]:
765                 regionID = regionTuple[regionIDField]
766                 regionsBins[regionID] = [0.] * bins
767
768     for chrom in hitDict:
769         if chrom not in regionsByChromDict:
770             continue
771
772         for regionTuple in regionsByChromDict[chrom]:
773             regionID = regionTuple[regionIDField]
774             regionsLen[regionID] = regionTuple[lengthField]
775
776         print "%s\n" % chrom
777         startRegion = 0
778         for read in hitDict[chrom]:
779             tagStart = read["start"]
780             weight = read["weight"]
781             index += 1
782             if index % 100000 == 0:
783                 print "read %d " % index,
784
785             stopPoint = tagStart + readlen
786             if startRegion < 0:
787                 startRegion = 0
788
789             for regionTuple in regionsByChromDict[chrom][startRegion:]:
790                 start = regionTuple[startField]
791                 stop = regionTuple[stopField]
792                 regionID = regionTuple[regionIDField]
793                 rlen = regionTuple[lengthField]
794                 try:
795                     rsense = regionTuple[senseField]
796                 except IndexError:
797                     rsense = "F"
798
799                 if tagStart > stop:
800                     startRegion += 1
801                     continue
802
803                 if start > stopPoint:
804                     startRegion -= 10
805                     break
806
807                 if start <= tagStart <= stop:
808                     if binLength < 1:
809                         regionBinLength = rlen / bins
810                     else:
811                         regionBinLength = binLength
812
813                     if rsense == "F":
814                         positionInRegion = tagStart - start
815                     else:
816                         positionInRegion = start + rlen - tagStart
817
818                     # we are relying on python's integer division quirk
819                     binID = positionInRegion / regionBinLength
820                     if (fixedFirstBin > 0) and (positionInRegion < fixedFirstBin):
821                         binID = 0
822                     elif fixedFirstBin > 0:
823                         binID = 1
824
825                     if binID >= bins:
826                         binID = bins - 1
827
828                     try:
829                         regionsBins[regionID][binID] += normalizedTag * weight
830                     except KeyError:
831                         print "%s %s" % (regionID, str(binID))
832
833                     stopPoint = stop
834
835     return (regionsBins, regionsLen)