development checkpoint
[erange.git] / commoncode.py
1 #
2 #  commoncode.py
3 #  ENRAGE
4 #
5
6 import ConfigParser
7 import os
8 import string
9 from time import strftime
10 from array import array
11 from collections import defaultdict
12 import Peak
13 from cistematic.core.geneinfo import geneinfoDB
14 from cistematic.genomes import Genome
15 import Region
16
17 commoncodeVersion = 5.6
18 currentRDSversion = 2.0
19
20 class ErangeError(Exception):
21     pass
22
23
24 def getReverseComplement(base):
25     revComp = {"A": "T",
26                "T": "A",
27                "G": "C",
28                "C": "G",
29                "N": "N"
30         }
31
32     return revComp[base]
33
34
35 def countDuplicatesInList(listToCheck):
36     tally = defaultdict(int)
37     for item in listToCheck:
38         tally[item] += 1
39
40     return tally.items()
41
42
43 def writeLog(logFile, messenger, message):
44     """ create a log file to write a message from a messenger or append to an existing file.
45     """
46     try:
47         logfile = open(logFile)
48     except IOError:
49         logfile = open(logFile, "w")
50     else:
51         logfile = open(logFile, "a")
52
53     logfile.writelines("%s: [%s] %s\n" % (strftime("%Y-%m-%d %H:%M:%S"), messenger, message))
54     logfile.close()
55
56
57 def getGeneInfoDict(genome, cache=False):
58     idb = geneinfoDB(cache=cache)
59     if genome == "dmelanogaster":
60         geneinfoDict = idb.getallGeneInfo(genome, infoKey="locus")
61     else:
62         geneinfoDict = idb.getallGeneInfo(genome)
63
64     return geneinfoDict
65
66
67 def getGeneAnnotDict(genome, inRAM=False):
68     return getExtendedGeneAnnotDict(genome, "", inRAM=inRAM)
69
70
71 def getExtendedGeneAnnotDict(genomeName, extendGenome, replaceModels=False, inRAM=False):
72     genome = Genome(genomeName, inRAM=inRAM)
73     if extendGenome != "":
74         genome.extendFeatures(extendGenome, replace=replaceModels)
75
76     geneannotDict = genome.allAnnotInfo()
77
78     return geneannotDict
79
80
81 def getConfigParser(fileList=[]):
82     configFiles = ["erange.config", os.path.expanduser("~/.erange.config")]
83     for filename in fileList:
84         configFiles.append(filename)
85
86     config = ConfigParser.SafeConfigParser()
87     config.read(configFiles)
88
89     return config
90
91
92 def getConfigOption(parser, section, option, default=None):
93     try:
94         setting = parser.get(section, option)
95     except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
96         setting = default
97
98     return setting
99
100
101 def getConfigIntOption(parser, section, option, default=None):
102     try:
103         setting = parser.getint(section, option)
104     except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
105         setting = default
106
107     return setting
108
109
110 def getConfigFloatOption(parser, section, option, default=None):
111     try:
112         setting = parser.getfloat(section, option)
113     except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
114         setting = default
115
116     return setting
117
118
119 def getConfigBoolOption(parser, section, option, default=None):
120     try:
121         setting = parser.getboolean(section, option)
122     except (ConfigParser.NoSectionError, ConfigParser.NoOptionError, ValueError):
123         setting = default
124
125     return setting
126
127
128 def getAllConfigSectionOptions(parser, section):
129     try:
130         setting = parser.items(section)
131     except ConfigParser.NoSectionError:
132         setting = []
133
134     return setting
135
136
137 def getMergedRegions(regionfilename, maxDist=1000, minHits=0, verbose=False, keepLabel=False,
138                      fullChrom=False, chromField=1, scoreField=4, pad=0, compact=False,
139                      doMerge=True, keepPeak=False, returnTop=0):
140
141     """ returns a dictionary containing a list of merged overlapping regions by chromosome; 
142     can optionally filter regions that have a scoreField fewer than minHits.
143     Can also optionally return the label of each region, as well as the
144     peak, if supplied (peakPos and peakHeight should be the last 2 fields).
145     Can return the top regions based on score if higher than minHits.
146     """
147     infile = open(regionfilename)
148     lines = infile.readlines()
149     regions = getMergedRegionsFromList(lines, maxDist, minHits, verbose, keepLabel,
150                                        fullChrom, chromField, scoreField, pad, compact,
151                                        doMerge, keepPeak, returnTop)
152
153     infile.close()
154
155     return regions
156
157
158 def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False, keepLabel=False,
159                      fullChrom = False, chromField=1, scoreField=4, pad=0, compact=False,
160                      doMerge=True, keepPeak=False, returnTop=0):
161     """ returns a dictionary containing a list of merged overlapping regions by chromosome; 
162     can optionally filter regions that have a scoreField fewer than minHits.
163     Can also optionally return the label of each region, as well as the
164     peak, if supplied (peakPos and peakHeight should be the last 2 fields).
165     Can return the top regions based on score if higher than minHits.
166     """
167     regions = {}
168     hasPvalue = 0
169     hasShift = 0
170     if 0 < returnTop < len(regionList):
171         scores = []
172         for regionEntry in regionList:
173             if regionEntry[0] == "#":
174                 if "pvalue" in regionEntry:
175                     hasPvalue = 1
176
177                 if "readShift" in regionEntry:
178                     hasShift = 1
179
180                 continue
181
182             fields = regionEntry.strip().split("\t")
183             hits = float(fields[scoreField].strip())
184             scores.append(hits)
185
186         scores.sort()
187         returnTop = -1 * returnTop 
188         minScore = scores[returnTop]
189         if minScore > minHits:
190             minHits = minScore
191
192     mergeCount = 0
193     chromField = int(chromField)
194     count = 0
195     for regionEntry in regionList:
196         if regionEntry[0] == "#":
197             if "pvalue" in regionEntry:
198                 hasPvalue = 1
199
200             if "readShift" in regionEntry:
201                 hasShift = 1
202
203             continue
204
205         fields = regionEntry.strip().split("\t")
206         if minHits >= 0:
207             try:
208                 hits = float(fields[scoreField].strip())
209             except (IndexError, ValueError):
210                 continue
211
212             if hits < minHits:
213                 continue
214
215         if compact:
216             (chrom, pos) = fields[chromField].split(":")
217             (front, back) = pos.split("-")
218             start = int(front)
219             stop = int(back)
220         elif chromField > 1:
221             label = string.join(fields[:chromField],"\t")
222             chrom = fields[chromField]
223             start = int(fields[chromField + 1]) - pad
224             stop = int(fields[chromField + 2]) + pad
225         else:
226             label = fields[0]
227             chrom = fields[1]
228             start = int(fields[2]) - pad
229             stop = int(fields[3]) + pad
230
231         if not fullChrom:
232             chrom = chrom[3:]
233
234         if keepPeak:
235             peakPos = int(fields[-2 - hasPvalue - hasShift])
236             peakHeight = float(fields[-1 - hasPvalue - hasShift])
237
238         if chrom not in regions:
239             regions[chrom] = []
240
241         merged = False
242
243         if doMerge and len(regions[chrom]) > 0:
244             for index in range(len(regions[chrom])):
245                 region = regions[chrom][index]
246                 rstart = region.start
247                 rstop = region.stop
248                 if regionsOverlap(start, stop, rstart, rstop) or regionsAreWithinDistance(start, stop, rstart, rstop, maxDist):
249                     if start < rstart:
250                         rstart = start
251
252                     if rstop < stop:
253                         rstop = stop
254
255                     if keepPeak:
256                         rpeakPos = region.peakPos
257                         rpeakHeight = region.peakHeight
258                         if peakHeight > rpeakHeight:
259                             rpeakHeight = peakHeight
260                             rpeakPos = peakPos
261
262                     regions[chrom][index].start = rstart
263                     regions[chrom][index].stop = rstop
264                     regions[chrom][index].length = abs(rstop - rstart)
265                     if keepLabel:
266                         regions[chrom][index].label = label
267
268                     if keepPeak:
269                         regions[chrom][index].peakPos = rpeakPos
270                         regions[chrom][index].peakHeight = rpeakHeight
271
272
273                     mergeCount += 1
274                     merged = True
275                     break
276
277         if not merged:
278             region = Region.Region(start, stop)
279             if keepLabel:
280                 region.label = label
281
282             if keepPeak:
283                 region.peakPos = peakPos
284                 region.peakHeight = peakHeight
285
286             regions[chrom].append(region)
287             count += 1
288
289         if verbose and (count % 100000 == 0):
290             print count
291
292     regionCount = 0
293     for chrom in regions:
294         regionCount += len(regions[chrom])
295         regions[chrom].sort(cmp=lambda x,y:cmp(x.start, y.start))
296
297     if verbose:
298         print "merged %d times" % mergeCount
299         print "returning %d regions" % regionCount
300
301     return regions
302
303
304 def regionsOverlap(start, stop, rstart, rstop):
305     if start > stop:
306         (start, stop) = (stop, start)
307
308     if rstart > rstop:
309         (rstart, rstop) = (rstop, rstart)
310
311     return (rstart <= start <= rstop) or (rstart <= stop <= rstop) or (start <= rstart <= stop) or (start <= rstop <= stop)
312
313
314 def regionsAreWithinDistance(start, stop, rstart, rstop, maxDist):
315     if start > stop:
316         (start, stop) = (stop, start)
317
318     if rstart > rstop:
319         (rstart, rstop) = (rstop, rstart)
320
321     return (abs(rstart-stop) <= maxDist) or (abs(rstop-start) <= maxDist)
322
323
324 def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False,
325              shift=0, maxshift=75):
326     """ find the peak in a list of reads (hitlist) in a region
327     of a given length and absolute start point. returns a
328     list of peaks, the number of hits, a triangular-smoothed
329     version of hitlist, and the number of reads that are
330     forward (plus) sense.
331     If doWeight is True, weight the reads accordingly.
332     If leftPlus is True, return the number of plus reads left of
333     the peak, taken to be the first TopPos position.
334     """
335
336     if shift == "auto":
337         shift = getBestShiftForRegion(hitList, start, length, useWeight=doWeight, maxShift=maxshift)
338
339     seqArray, regionArray, numHits, numPlus = findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, leftPlus)
340
341     # implementing a triangular smooth
342     smoothArray = array("f", [0.] * length)
343     for pos in range(2,length -2):
344         smoothArray[pos] = (seqArray[pos -2] + 2 * seqArray[pos - 1] + 3 * seqArray[pos] + 2 * seqArray[pos + 1] + seqArray[pos + 2]) / 9.0
345
346     topPos = getPeakPositionList(smoothArray, length)
347     peak = Peak.Peak(topPos, numHits, smoothArray, numPlus, shift=shift)
348
349     if leftPlus:
350         numLeftPlus = 0
351         maxPos = topPos[0]
352         for read in regionArray:
353             if doWeight:
354                 weight = read["weight"]
355             else:
356                 weight = 1.0
357
358             currentPos = read["start"] - start
359             if currentPos <= maxPos and read["sense"] == "+":
360                 numLeftPlus += weight
361
362         peak.numLeftPlus = numLeftPlus
363
364     return peak
365
366
367 def getBestShiftForRegion(readList, start, length, useWeight=False, maxShift=75):
368     bestShift = 0
369     lowestScore = 20000000000
370     for testShift in xrange(maxShift + 1):
371         shiftArray = array("f", [0.] * length)
372         for read in readList:
373             currentpos = read["start"] - start
374             if read["sense"] == "+":
375                 currentpos += testShift
376             else:
377                 currentpos -= testShift
378
379             if (currentpos < 1) or (currentpos >= length):
380                 continue
381
382             if useWeight:
383                 weight = read["weight"]
384             else:
385                 weight = 1.0
386
387             if read["sense"] == "+":
388                 shiftArray[currentpos] += weight
389             else:
390                 shiftArray[currentpos] -= weight
391
392         currentScore = 0
393         for score in shiftArray:
394             currentScore += abs(score)
395
396         print currentScore
397         if currentScore < lowestScore:
398             bestShift = testShift
399             lowestScore = currentScore
400
401     return bestShift
402
403
404 def findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, leftPlus):
405     seqArray = array("f", [0.] * length)
406     numHits = 0.
407     numPlus = 0.
408     regionArray = []
409     for read in hitList:
410         currentpos = read["start"] - start
411         if read["sense"] == "+":
412             currentpos += shift
413         else:
414             currentpos -= shift
415
416         if (currentpos <  1 - readlen) or (currentpos >= length):
417             continue
418
419         if doWeight:
420             weight = read["weight"]
421         else:
422             weight = 1.0
423
424         numHits += weight
425         if leftPlus:
426             regionArray.append(read)
427
428         hitIndex = 0
429         while currentpos < 0:
430             hitIndex += 1
431             currentpos += 1
432
433         while hitIndex < readlen and currentpos < length:
434             seqArray[currentpos] += weight
435             hitIndex += 1
436             currentpos += 1
437
438         if read["sense"] == "+":
439             numPlus += weight
440
441     return seqArray, regionArray, numHits, numPlus
442
443
444 def getPeakPositionList(smoothArray, length):
445     topNucleotide = 0
446     peakList = []
447     for currentpos in xrange(length):
448         if topNucleotide < smoothArray[currentpos]:
449             topNucleotide = smoothArray[currentpos]
450             peakList = [currentpos]
451         elif topNucleotide == smoothArray[currentpos]:
452             peakList.append(currentpos)
453
454     return peakList
455
456
457 def getFeaturesByChromDict(genomeObject, additionalRegionsDict={}, ignorePseudo=False,
458                            restrictList=[], regionComplement=False, maxStop=250000000):
459     """ return a dictionary of cistematic gene features. Requires
460     cistematic, obviously. Can filter-out pseudogenes. Will use
461     additional regions dict to supplement gene models, if available.
462     Can restrict output to a list of GIDs.
463     If regionComplement is set to true, returns the regions *outside* of the
464     calculated boundaries, which is useful for retrieving intronic and
465     intergenic regions. maxStop is simply used to define the uppermost
466     boundary of the complement region.
467     """ 
468     featuresDict = genomeObject.getallGeneFeatures()
469     restrictGID = False
470     if len(restrictList) > 0:
471         restrictGID = True
472
473     if len(additionalRegionsDict) > 0:
474         sortList = []
475         for chrom in additionalRegionsDict:
476             for region in additionalRegionsDict[chrom]:
477                 label = region.label
478                 if label not in sortList:
479                     sortList.append(label)
480
481                 if label not in featuresDict:
482                     featuresDict[label] = []
483                     sense = "+"
484                 else:
485                     sense = featuresDict[label][0][-1]
486
487                 featuresDict[label].append(("custom", chrom, region.start, region.stop, sense))
488
489         for gid in sortList:
490             featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
491
492     featuresByChromDict = {}
493     for gid in featuresDict:
494         if restrictGID and gid not in restrictList:
495             continue
496
497         featureList = featuresDict[gid]
498         newFeatureList = []
499         isPseudo = False
500         for (ftype, chrom, start, stop, sense) in featureList:
501             if ftype == "PSEUDO":
502                 isPseudo = True
503
504             if (start, stop, ftype) not in newFeatureList:
505                 notContained = True
506                 containedList = []
507                 for (fstart, fstop, ftype2) in newFeatureList:
508                     if start >= fstart and stop <= fstop:
509                         notContained = False
510
511                     if start < fstart and stop > fstop:
512                         containedList.append((fstart, fstop))
513
514                 if len(containedList) > 0:
515                     newFList = []
516                     notContained = True
517                     for (fstart, fstop, ftype2) in newFeatureList:
518                         if (fstart, fstop) not in containedList:
519                             newFList.append((fstart, fstop, ftype2))
520                             if start >= fstart and stop <= fstop:
521                                 notContained = False
522
523                     newFeatureList = newFList
524                 if notContained:
525                     newFeatureList.append((start, stop, ftype))
526
527         if ignorePseudo and isPseudo:
528             continue
529
530         if chrom not in featuresByChromDict:
531             featuresByChromDict[chrom] = []
532
533         for (start, stop, ftype) in newFeatureList:
534             featuresByChromDict[chrom].append((start, stop, gid, sense, ftype))
535
536     for chrom in featuresByChromDict:
537         featuresByChromDict[chrom].sort()
538
539     if regionComplement:
540         complementByChromDict = {}
541         complementIndex = 0
542         for chrom in featuresByChromDict:
543             complementByChromDict[chrom] = []
544             listLength = len(featuresByChromDict[chrom])
545             if listLength > 0:
546                 currentStart = 0
547                 for index in range(listLength):
548                     currentStop = featuresByChromDict[chrom][index][0]
549                     complementIndex += 1
550                     if currentStart < currentStop:
551                         complementByChromDict[chrom].append((currentStart, currentStop, "nonExon%d" % complementIndex, "F", "nonExon"))
552
553                     currentStart = featuresByChromDict[chrom][index][1]
554
555                 currentStop = maxStop
556                 complementByChromDict[chrom].append((currentStart, currentStop, "nonExon%d" % complementIndex, "F", "nonExon"))
557
558         return (featuresByChromDict, complementByChromDict)
559     else:
560         return featuresByChromDict
561
562
563 def getLocusByChromDict(genome, upstream=0, downstream=0, useCDS=True,
564                         additionalRegionsDict={}, ignorePseudo=False, upstreamSpanTSS=False,
565                         lengthCDS=0, keepSense=False, adjustToNeighbor=True):
566     """ return a dictionary of gene loci. Can be used to retrieve additional
567     sequence upstream or downstream of gene, up to the next gene. Requires
568     cistematic, obviously.
569     Can filter-out pseudogenes and use additional regions outside of existing
570     gene models. Use upstreamSpanTSS to overlap half of the upstream region
571     over the TSS.
572     If lengthCDS > 0 bp, e.g. X, return only the starting X bp from CDS. If
573     lengthCDS < 0bp, return only the last X bp from CDS.
574     """ 
575     locusByChromDict = {}
576     if upstream == 0 and downstream == 0 and not useCDS:
577         print "getLocusByChromDict: asked for no sequence - returning empty dict"
578         return locusByChromDict
579     elif upstream > 0 and downstream > 0 and not useCDS:
580         print "getLocusByChromDict: asked for only upstream and downstream - returning empty dict"
581         return locusByChromDict
582     elif lengthCDS != 0 and not useCDS:
583         print "getLocusByChromDict: asked for partial CDS but not useCDS - returning empty dict"
584         return locusByChromDict
585     elif upstreamSpanTSS and lengthCDS != 0:
586         print "getLocusByChromDict: asked for TSS spanning and partial CDS - returning empty dict"
587         return locusByChromDict
588     elif lengthCDS > 0 and downstream > 0:
589         print "getLocusByChromDict: asked for discontinuous partial CDS from start and downstream - returning empty dict"
590         return locusByChromDict
591     elif lengthCDS < 0 and upstream > 0:
592         print "getLocusByChromDict: asked for discontinuous partial CDS from stop and upstream - returning empty dict"
593         return locusByChromDict
594
595     genomeName = genome.genome
596     featuresDict = getGeneFeatures(genome, additionalRegionsDict)
597     for gid in featuresDict:
598         featureList = featuresDict[gid]
599         newFeatureList = []
600         for (ftype, chrom, start, stop, sense) in featureList:
601             newFeatureList.append((start, stop))
602
603         if ignorePseudo and ftype == "PSEUDO":
604             continue
605
606         newFeatureList.sort()
607
608         sense = featureList[0][-1]
609         gstart = newFeatureList[0][0]
610         gstop = newFeatureList[-1][1]
611         glen = abs(gstart - gstop)
612         if sense == "F":
613             if not useCDS and upstream > 0:
614                 if upstreamSpanTSS:
615                     if gstop > (gstart + upstream / 2):
616                         gstop = gstart + upstream / 2
617                 else:
618                     gstop = gstart
619             elif not useCDS and downstream > 0:
620                 gstart = gstop
621
622             if upstream > 0:
623                 if upstreamSpanTSS:
624                     distance = upstream / 2
625                 else:
626                     distance = upstream
627
628                 if adjustToNeighbor:
629                     nextGene = genome.leftGeneDistance((genomeName, gid), distance * 2)
630                     if nextGene < distance * 2:
631                         distance = nextGene / 2
632
633                 distance = max(distance, 1)
634                 gstart -= distance
635
636             if downstream > 0:
637                 distance = downstream
638                 if adjustToNeighbor:
639                     nextGene = genome.rightGeneDistance((genomeName, gid), downstream * 2)
640                     if nextGene < downstream * 2:
641                         distance = nextGene / 2
642
643                 distance = max(distance, 1)
644                 gstop += distance
645
646             if 0 < lengthCDS < glen:
647                 gstop = newFeatureList[0][0] + lengthCDS
648
649             if lengthCDS < 0 and abs(lengthCDS) < glen:
650                 gstart = newFeatureList[-1][1] + lengthCDS
651         else:
652             if not useCDS and upstream > 0:
653                 if upstreamSpanTSS:
654                     if gstart < (gstop - upstream / 2):
655                         gstart = gstop - upstream / 2
656                 else:
657                     gstart = gstop
658             elif not useCDS and downstream > 0:
659                     gstop = gstart
660
661             if upstream > 0:
662                 if upstreamSpanTSS:
663                     distance = upstream /2
664                 else:
665                     distance = upstream
666
667                 if adjustToNeighbor:
668                     nextGene = genome.rightGeneDistance((genomeName, gid), distance * 2)
669                     if nextGene < distance * 2:
670                         distance = nextGene / 2
671
672                 distance = max(distance, 1)
673                 gstop += distance
674
675             if downstream > 0:
676                 distance = downstream
677                 if adjustToNeighbor:
678                     nextGene = genome.leftGeneDistance((genomeName, gid), downstream * 2)
679                     if nextGene < downstream * 2:
680                         distance = nextGene / 2
681
682                 distance = max(distance, 1)
683                 gstart -= distance
684
685             if 0 < lengthCDS < glen:
686                 gstart = newFeatureList[-1][-1] - lengthCDS
687
688             if lengthCDS < 0 and abs(lengthCDS) < glen:
689                 gstop = newFeatureList[0][0] - lengthCDS
690
691         if chrom not in locusByChromDict:
692             locusByChromDict[chrom] = []
693
694         if keepSense:
695             locusByChromDict[chrom].append((gstart, gstop, gid, glen, sense))
696         else:
697             locusByChromDict[chrom].append((gstart, gstop, gid, glen))
698
699     for chrom in locusByChromDict:
700         locusByChromDict[chrom].sort()
701
702     return locusByChromDict
703
704
705 def getGeneFeatures(genome, additionalRegionsDict):
706     featuresDict = genome.getallGeneFeatures()
707     if len(additionalRegionsDict) > 0:
708         sortList = []
709         for chrom in additionalRegionsDict:
710             for region in additionalRegionsDict[chrom]:
711                 label = region.label
712                 if label not in sortList:
713                     sortList.append(label)
714
715                 if label not in featuresDict:
716                     featuresDict[label] = []
717                     sense = "+"
718                 else:
719                     sense = featuresDict[label][0][-1]
720
721                 featuresDict[label].append(("custom", chrom, region.start, region.stop, sense))
722
723         for gid in sortList:
724             featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
725
726     return featuresDict
727
728
729 def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[],
730                       normalizedTag=1., defaultRegionFormat=True, fixedFirstBin=-1,
731                       binLength=-1):
732     """ returns 2 dictionaries of bin counts and region lengths, given a dictionary of predefined regions,
733         a dictionary of reads, a number of bins, the length of reads, and optionally a list of regions
734         or a different weight / tag.
735     """
736     index = 0
737     regionsBins = {}
738     regionsLen = {}
739
740     if defaultRegionFormat:
741         regionIDField = 0
742         startField = 1
743         stopField = 2
744         lengthField = 3
745     else:
746         startField = 0
747         stopField = 1
748         regionIDField = 2
749         lengthField = 3
750
751     senseField = 4
752
753     print "entering computeRegionBins"
754     if len(regionList) > 0:
755         for readID in regionList:
756             regionsBins[readID] = [0.] * bins
757     else:
758         for chrom in regionsByChromDict:
759             for regionTuple in regionsByChromDict[chrom]:
760                 regionID = regionTuple[regionIDField]
761                 regionsBins[regionID] = [0.] * bins
762
763     for chrom in hitDict:
764         if chrom not in regionsByChromDict:
765             continue
766
767         for regionTuple in regionsByChromDict[chrom]:
768             regionID = regionTuple[regionIDField]
769             regionsLen[regionID] = regionTuple[lengthField]
770
771         print "%s\n" % chrom
772         startRegion = 0
773         for read in hitDict[chrom]:
774             tagStart = read["start"]
775             weight = read["weight"]
776             index += 1
777             if index % 100000 == 0:
778                 print "read %d " % index,
779
780             stopPoint = tagStart + readlen
781             if startRegion < 0:
782                 startRegion = 0
783
784             for regionTuple in regionsByChromDict[chrom][startRegion:]:
785                 start = regionTuple[startField]
786                 stop = regionTuple[stopField]
787                 regionID = regionTuple[regionIDField]
788                 rlen = regionTuple[lengthField]
789                 try:
790                     rsense = regionTuple[senseField]
791                 except IndexError:
792                     rsense = "F"
793
794                 if tagStart > stop:
795                     startRegion += 1
796                     continue
797
798                 if start > stopPoint:
799                     startRegion -= 10
800                     break
801
802                 if start <= tagStart <= stop:
803                     if binLength < 1:
804                         regionBinLength = rlen / bins
805                     else:
806                         regionBinLength = binLength
807
808                     startdist = tagStart - start
809                     if rsense == "F":
810                         # we are relying on python's integer division quirk
811                         binID = startdist / regionBinLength
812                         if (fixedFirstBin > 0) and (startdist < fixedFirstBin):
813                             binID = 0
814                         elif fixedFirstBin > 0:
815                             binID = 1
816
817                         if binID >= bins:
818                             binID = bins - 1
819
820                         try:
821                             regionsBins[regionID][binID] += normalizedTag * weight
822                         except KeyError:
823                             print "%s %s" % (regionID, str(binID))
824                     else:
825                         rdist = rlen - startdist
826                         binID = rdist / regionBinLength
827                         if (fixedFirstBin > 0) and (rdist < fixedFirstBin):
828                             binID = 0
829                         elif fixedFirstBin > 0:
830                             binID = 1
831
832                         if binID >= bins:
833                             binID = bins - 1
834
835                         try:
836                             regionsBins[regionID][binID] += normalizedTag * weight
837                         except KeyError:
838                             print "%s %s" % (regionID, str(binID))
839
840                     stopPoint = stop
841
842     return (regionsBins, regionsLen)