erange version 4.0a dev release
[erange.git] / commoncode.py
1 #
2 #  commoncode.py
3 #  ENRAGE
4 #
5
6 import ConfigParser
7 import os
8 import string
9 from time import strftime
10 from array import array
11 from collections import defaultdict
12 import Peak
13 from cistematic.core.geneinfo import geneinfoDB
14 from cistematic.genomes import Genome
15 import Region
16
17 commoncodeVersion = 5.6
18 currentRDSversion = 2.0
19
20
21 def getReverseComplement(base):
22     revComp = {"A": "T",
23                "T": "A",
24                "G": "C",
25                "C": "G",
26                "N": "N"
27         }
28
29     return revComp[base]
30
31
32 def countDuplicatesInList(listToCheck):
33     tally = defaultdict(int)
34     for item in listToCheck:
35         tally[item] += 1
36
37     return tally.items()
38
39
40 def writeLog(logFile, messenger, message):
41     """ create a log file to write a message from a messenger or append to an existing file.
42     """
43     try:
44         logfile = open(logFile)
45     except IOError:
46         logfile = open(logFile, "w")
47     else:
48         logfile = open(logFile, "a")
49
50     logfile.writelines("%s: [%s] %s\n" % (strftime("%Y-%m-%d %H:%M:%S"), messenger, message))
51     logfile.close()
52
53
54 def getGeneInfoDict(genome, cache=False):
55     idb = geneinfoDB(cache=cache)
56     if genome == "dmelanogaster":
57         geneinfoDict = idb.getallGeneInfo(genome, infoKey="locus")
58     else:
59         geneinfoDict = idb.getallGeneInfo(genome)
60
61     return geneinfoDict
62
63
64 def getGeneAnnotDict(genome, inRAM=False):
65     return getExtendedGeneAnnotDict(genome, "", inRAM=inRAM)
66
67
68 def getExtendedGeneAnnotDict(genome, extendGenome, replaceModels=False, inRAM=False):
69     hg = Genome(genome, inRAM=inRAM)
70     if extendGenome != "":
71         hg.extendFeatures(extendGenome, replace=replaceModels)
72
73     geneannotDict = hg.allAnnotInfo()
74
75     return geneannotDict
76
77
78 def getConfigParser(fileList=[]):
79     configFiles = ["erange.config", os.path.expanduser("~/.erange.config")]
80     for filename in fileList:
81         configFiles.append(filename)
82
83     config = ConfigParser.SafeConfigParser()
84     config.read(configFiles)
85
86     return config
87
88
89 def getConfigOption(parser, section, option, default=None):
90     try:
91         setting = parser.get(section, option)
92     except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
93         setting = default
94
95     return setting
96
97
98 def getConfigIntOption(parser, section, option, default=None):
99     try:
100         setting = parser.getint(section, option)
101     except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
102         setting = default
103
104     return setting
105
106
107 def getConfigFloatOption(parser, section, option, default=None):
108     try:
109         setting = parser.getfloat(section, option)
110     except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
111         setting = default
112
113     return setting
114
115
116 def getConfigBoolOption(parser, section, option, default=None):
117     try:
118         setting = parser.getboolean(section, option)
119     except (ConfigParser.NoSectionError, ConfigParser.NoOptionError, ValueError):
120         setting = default
121
122     return setting
123
124
125 def getAllConfigSectionOptions(parser, section):
126     try:
127         setting = parser.items(section)
128     except ConfigParser.NoSectionError:
129         setting = []
130
131     return setting
132
133
134 def getMergedRegions(regionfilename, maxDist=1000, minHits=0, verbose=False, keepLabel=False,
135                      fullChrom=False, chromField=1, scoreField=4, pad=0, compact=False,
136                      doMerge=True, keepPeak=False, returnTop=0):
137
138     """ returns a dictionary containing a list of merged overlapping regions by chromosome; 
139     can optionally filter regions that have a scoreField fewer than minHits.
140     Can also optionally return the label of each region, as well as the
141     peak, if supplied (peakPos and peakHeight should be the last 2 fields).
142     Can return the top regions based on score if higher than minHits.
143     """
144     infile = open(regionfilename)
145     lines = infile.readlines()
146     regions = getMergedRegionsFromList(lines, maxDist, minHits, verbose, keepLabel,
147                                        fullChrom, chromField, scoreField, pad, compact,
148                                        doMerge, keepPeak, returnTop)
149
150     infile.close()
151
152     return regions
153
154
155 def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False, keepLabel=False,
156                      fullChrom = False, chromField=1, scoreField=4, pad=0, compact=False,
157                      doMerge=True, keepPeak=False, returnTop=0):
158     """ returns a dictionary containing a list of merged overlapping regions by chromosome; 
159     can optionally filter regions that have a scoreField fewer than minHits.
160     Can also optionally return the label of each region, as well as the
161     peak, if supplied (peakPos and peakHeight should be the last 2 fields).
162     Can return the top regions based on score if higher than minHits.
163     """
164     regions = {}
165     hasPvalue = 0
166     hasShift = 0
167     if 0 < returnTop < len(regionList):
168         scores = []
169         for regionEntry in regionList:
170             if regionEntry[0] == "#":
171                 if "pvalue" in regionEntry:
172                     hasPvalue = 1
173
174                 if "readShift" in regionEntry:
175                     hasShift = 1
176
177                 continue
178
179             fields = regionEntry.strip().split("\t")
180             hits = float(fields[scoreField].strip())
181             scores.append(hits)
182
183         scores.sort()
184         returnTop = -1 * returnTop 
185         minScore = scores[returnTop]
186         if minScore > minHits:
187             minHits = minScore
188
189     mergeCount = 0
190     chromField = int(chromField)
191     count = 0
192     #TODO: Current algorithm processes input file line by line and compares with prior lines.  Problem is it
193     #      exits at the first merge.  This is not a problem when the input is sorted by start position, but in
194     #      the case of 3 regions ABC that are in the input file as ACB as it goes now when processing C there
195     #      will be no merge with A as B is needed to bridge the two.  When it comes time to process B it will
196     #      be merged with A but that will exit the loop and the merge with C will be missed.
197     for regionEntry in regionList:
198         if regionEntry[0] == "#":
199             if "pvalue" in regionEntry:
200                 hasPvalue = 1
201
202             if "readShift" in regionEntry:
203                 hasShift = 1
204
205             continue
206
207         fields = regionEntry.strip().split("\t")
208         if minHits >= 0:
209             try:
210                 hits = float(fields[scoreField].strip())
211             except (IndexError, ValueError):
212                 continue
213
214             if hits < minHits:
215                 continue
216
217         if compact:
218             (chrom, pos) = fields[chromField].split(":")
219             (front, back) = pos.split("-")
220             start = int(front)
221             stop = int(back)
222         elif chromField > 1:
223             label = string.join(fields[:chromField],"\t")
224             chrom = fields[chromField]
225             start = int(fields[chromField + 1]) - pad
226             stop = int(fields[chromField + 2]) + pad
227         else:
228             label = fields[0]
229             chrom = fields[1]
230             start = int(fields[2]) - pad
231             stop = int(fields[3]) + pad
232
233         if not fullChrom:
234             chrom = chrom[3:]
235
236         if keepPeak:
237             peakPos = int(fields[-2 - hasPvalue - hasShift])
238             peakHeight = float(fields[-1 - hasPvalue - hasShift])
239
240         if chrom not in regions:
241             regions[chrom] = []
242
243         merged = False
244
245         if doMerge and len(regions[chrom]) > 0:
246             for index in range(len(regions[chrom])):
247                 region = regions[chrom][index]
248                 rstart = region.start
249                 rstop = region.stop
250                 if regionsOverlap(start, stop, rstart, rstop) or regionsAreWithinDistance(start, stop, rstart, rstop, maxDist):
251                     if start < rstart:
252                         rstart = start
253
254                     if rstop < stop:
255                         rstop = stop
256
257                     if keepPeak:
258                         rpeakPos = region.peakPos
259                         rpeakHeight = region.peakHeight
260                         if peakHeight > rpeakHeight:
261                             rpeakHeight = peakHeight
262                             rpeakPos = peakPos
263
264                     regions[chrom][index].start = rstart
265                     regions[chrom][index].stop = rstop
266                     regions[chrom][index].length = abs(rstop - rstart)
267                     if keepLabel:
268                         regions[chrom][index].label = label
269
270                     if keepPeak:
271                         regions[chrom][index].peakPos = rpeakPos
272                         regions[chrom][index].peakHeight = rpeakHeight
273
274
275                     mergeCount += 1
276                     merged = True
277                     break
278
279         if not merged:
280             region = Region.Region(start, stop)
281             if keepLabel:
282                 region.label = label
283
284             if keepPeak:
285                 region.peakPos = peakPos
286                 region.peakHeight = peakHeight
287
288             regions[chrom].append(region)
289             count += 1
290
291         if verbose and (count % 100000 == 0):
292             print count
293
294     regionCount = 0
295     for chrom in regions:
296         regionCount += len(regions[chrom])
297         regions[chrom].sort(cmp=lambda x,y:cmp(x.start, y.start))
298
299     if verbose:
300         print "merged %d times" % mergeCount
301         print "returning %d regions" % regionCount
302
303     return regions
304
305
306 def regionsOverlap(start, stop, rstart, rstop):
307     if start > stop:
308         (start, stop) = (stop, start)
309
310     if rstart > rstop:
311         (rstart, rstop) = (rstop, rstart)
312
313     return (rstart <= start <= rstop) or (rstart <= stop <= rstop) or (start <= rstart <= stop) or (start <= rstop <= stop)
314
315
316 def regionsAreWithinDistance(start, stop, rstart, rstop, maxDist):
317     if start > stop:
318         (start, stop) = (stop, start)
319
320     if rstart > rstop:
321         (rstart, rstop) = (rstop, rstart)
322
323     return (abs(rstart-stop) <= maxDist) or (abs(rstop-start) <= maxDist)
324
325
326 def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False,
327              shift=0, maxshift=75):
328     """ find the peak in a list of reads (hitlist) in a region
329     of a given length and absolute start point. returns a
330     list of peaks, the number of hits, a triangular-smoothed
331     version of hitlist, and the number of reads that are
332     forward (plus) sense.
333     If doWeight is True, weight the reads accordingly.
334     If leftPlus is True, return the number of plus reads left of
335     the peak, taken to be the first TopPos position.
336     """
337
338     if shift == "auto":
339         shift = getBestShiftForRegion(hitList, start, length, doWeight, maxshift)
340
341     seqArray, regionArray, numHits, numPlus = findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, leftPlus)
342
343     # implementing a triangular smooth
344     smoothArray = array("f", [0.] * length)
345     for pos in range(2,length -2):
346         smoothArray[pos] = (seqArray[pos -2] + 2 * seqArray[pos - 1] + 3 * seqArray[pos] + 2 * seqArray[pos + 1] + seqArray[pos + 2]) / 9.0
347
348     topPos = getPeakPositionList(smoothArray, length)
349     peak = Peak(topPos, numHits, smoothArray, numPlus, shift=shift)
350
351     if leftPlus:
352         numLeftPlus = 0
353         maxPos = topPos[0]
354         for read in regionArray:
355             if doWeight:
356                 weight = read["weight"]
357             else:
358                 weight = 1.0
359
360             currentPos = read["start"] - start
361             if currentPos <= maxPos and read["sense"] == "+":
362                 numLeftPlus += weight
363
364         peak.numLeftPlus = numLeftPlus
365
366     return peak
367
368
369 def getBestShiftForRegion(hitList, start, length, doWeight=False, maxShift=75):
370     bestShift = 0
371     lowestScore = 20000000000
372     for testShift in xrange(maxShift + 1):
373         shiftArray = array("f", [0.] * length)
374         for read in hitList:
375             currentpos = read["start"] - start
376             if read["sense"] == "+":
377                 currentpos += testShift
378             else:
379                 currentpos -= testShift
380
381             if (currentpos < 1) or (currentpos >= length):
382                 continue
383
384             if doWeight:
385                 weight = read["weight"]
386             else:
387                 weight = 1.0
388
389             if read["sense"] == "+":
390                 shiftArray[currentpos] += weight
391             else:
392                 shiftArray[currentpos] -= weight
393
394         currentScore = 0
395         for score in shiftArray:
396             currentScore += abs(score)
397
398         print currentScore
399         if currentScore < lowestScore:
400             bestShift = testShift
401             lowestScore = currentScore
402
403     return bestShift
404
405
406 def findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, leftPlus):
407     seqArray = array("f", [0.] * length)
408     numHits = 0.
409     numPlus = 0.
410     regionArray = []
411     for read in hitList:
412         currentpos = read["start"] - start
413         if read["sense"] == "+":
414             currentpos += shift
415         else:
416             currentpos -= shift
417
418         if (currentpos <  1 - readlen) or (currentpos >= length):
419             continue
420
421         if doWeight:
422             weight = read["weight"]
423         else:
424             weight = 1.0
425
426         numHits += weight
427         if leftPlus:
428             regionArray.append(read)
429
430         hitIndex = 0
431         while currentpos < 0:
432             hitIndex += 1
433             currentpos += 1
434
435         while hitIndex < readlen and  currentpos < length:
436             seqArray[currentpos] += weight
437             hitIndex += 1
438             currentpos += 1
439
440         if read["sense"] == "+":
441             numPlus += weight
442
443     return seqArray, regionArray, numHits, numPlus
444
445
446 def getPeakPositionList(smoothArray, length):
447     topNucleotide = 0
448     peakList = []
449     for currentpos in xrange(length):
450         if topNucleotide < smoothArray[currentpos]:
451             topNucleotide = smoothArray[currentpos]
452             peakList = [currentpos]
453         elif topNucleotide  == smoothArray[currentpos]:
454             peakList.append(currentpos)
455
456     return peakList
457
458
459 def getFeaturesByChromDict(genomeObject, additionalRegionsDict={}, ignorePseudo=False,
460                            restrictList=[], regionComplement=False, maxStop=250000000):
461     """ return a dictionary of cistematic gene features. Requires
462     cistematic, obviously. Can filter-out pseudogenes. Will use
463     additional regions dict to supplement gene models, if available.
464     Can restrict output to a list of GIDs.
465     If regionComplement is set to true, returns the regions *outside* of the
466     calculated boundaries, which is useful for retrieving intronic and
467     intergenic regions. maxStop is simply used to define the uppermost
468     boundary of the complement region.
469     """ 
470     featuresDict = genomeObject.getallGeneFeatures()
471     restrictGID = False
472     if len(restrictList) > 0:
473         restrictGID = True
474
475     if len(additionalRegionsDict) > 0:
476         sortList = []
477         for chrom in additionalRegionsDict:
478             for region in additionalRegionsDict[chrom]:
479                 label = region.label
480                 if label not in sortList:
481                     sortList.append(label)
482
483                 if label not in featuresDict:
484                     featuresDict[label] = []
485                     sense = "+"
486                 else:
487                     sense = featuresDict[label][0][-1]
488
489                 featuresDict[label].append(("custom", chrom, region.start, region.stop, sense))
490
491         for gid in sortList:
492             featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
493
494     featuresByChromDict = {}
495     for gid in featuresDict:
496         if restrictGID and gid not in restrictList:
497             continue
498
499         featureList = featuresDict[gid]
500         newFeatureList = []
501         isPseudo = False
502         for (ftype, chrom, start, stop, sense) in featureList:
503             if ftype == "PSEUDO":
504                 isPseudo = True
505
506             if (start, stop, ftype) not in newFeatureList:
507                 notContained = True
508                 containedList = []
509                 for (fstart, fstop, ftype2) in newFeatureList:
510                     if start >= fstart and stop <= fstop:
511                         notContained = False
512
513                     if start < fstart and stop > fstop:
514                         containedList.append((fstart, fstop))
515
516                 if len(containedList) > 0:
517                     newFList = []
518                     notContained = True
519                     for (fstart, fstop, ftype2) in newFeatureList:
520                         if (fstart, fstop) not in containedList:
521                             newFList.append((fstart, fstop, ftype2))
522                             if start >= fstart and stop <= fstop:
523                                 notContained = False
524
525                     newFeatureList = newFList
526                 if notContained:
527                     newFeatureList.append((start, stop, ftype))
528
529         if ignorePseudo and isPseudo:
530             continue
531
532         if chrom not in featuresByChromDict:
533             featuresByChromDict[chrom] = []
534
535         for (start, stop, ftype) in newFeatureList:
536             featuresByChromDict[chrom].append((start, stop, gid, sense, ftype))
537
538     for chrom in featuresByChromDict:
539         featuresByChromDict[chrom].sort()
540
541     if regionComplement:
542         complementByChromDict = {}
543         complementIndex = 0
544         for chrom in featuresByChromDict:
545             complementByChromDict[chrom] = []
546             listLength = len(featuresByChromDict[chrom])
547             if listLength > 0:
548                 currentStart = 0
549                 for index in range(listLength):
550                     currentStop = featuresByChromDict[chrom][index][0]
551                     complementIndex += 1
552                     if currentStart < currentStop:
553                         complementByChromDict[chrom].append((currentStart, currentStop, "nonExon%d" % complementIndex, "F", "nonExon"))
554
555                     currentStart = featuresByChromDict[chrom][index][1]
556
557                 currentStop = maxStop
558                 complementByChromDict[chrom].append((currentStart, currentStop, "nonExon%d" % complementIndex, "F", "nonExon"))
559
560         return (featuresByChromDict, complementByChromDict)
561     else:
562         return featuresByChromDict
563
564
565 def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True,
566                         additionalRegionsDict={}, ignorePseudo=False, upstreamSpanTSS=False,
567                         lengthCDS=0, keepSense=False, adjustToNeighbor=True):
568     """ return a dictionary of gene loci. Can be used to retrieve additional
569     sequence upstream or downstream of gene, up to the next gene. Requires
570     cistematic, obviously.
571     Can filter-out pseudogenes and use additional regions outside of existing
572     gene models. Use upstreamSpanTSS to overlap half of the upstream region
573     over the TSS.
574     If lengthCDS > 0 bp, e.g. X, return only the starting X bp from CDS. If
575     lengthCDS < 0bp, return only the last X bp from CDS.
576     """ 
577     locusByChromDict = {}
578     if upstream == 0 and downstream == 0 and not useCDS:
579         print "getLocusByChromDict: asked for no sequence - returning empty dict"
580         return locusByChromDict
581     elif upstream > 0 and downstream > 0 and not useCDS:
582         print "getLocusByChromDict: asked for only upstream and downstream - returning empty dict"
583         return locusByChromDict
584     elif lengthCDS != 0 and not useCDS:
585         print "getLocusByChromDict: asked for partial CDS but not useCDS - returning empty dict"
586         return locusByChromDict
587     elif upstreamSpanTSS and lengthCDS != 0:
588         print "getLocusByChromDict: asked for TSS spanning and partial CDS - returning empty dict"
589         return locusByChromDict
590     elif lengthCDS > 0 and downstream > 0:
591         print "getLocusByChromDict: asked for discontinuous partial CDS from start and downstream - returning empty dict"
592         return locusByChromDict
593     elif lengthCDS < 0 and upstream > 0:
594         print "getLocusByChromDict: asked for discontinuous partial CDS from stop and upstream - returning empty dict"
595         return locusByChromDict
596
597     genome = genomeObject.genome
598     featuresDict = genomeObject.getallGeneFeatures()
599     if len(additionalRegionsDict) > 0:
600         sortList = []
601         for chrom in additionalRegionsDict:
602             for region in additionalRegionsDict[chrom]:
603                 label = region.label
604                 if label not in sortList:
605                     sortList.append(label)
606
607                 if label not in featuresDict:
608                     featuresDict[label] = []
609                     sense = "+"
610                 else:
611                     sense = featuresDict[label][0][-1]
612
613                 featuresDict[label].append(("custom", chrom, region.start, region.stop, sense))
614
615         for gid in sortList:
616             featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
617
618     for gid in featuresDict:
619         featureList = featuresDict[gid]
620         newFeatureList = []
621         for (ftype, chrom, start, stop, sense) in featureList:
622             newFeatureList.append((start, stop))
623
624         if ignorePseudo and ftype == "PSEUDO":
625             continue
626
627         newFeatureList.sort()
628
629         sense = featureList[0][-1]
630         gstart = newFeatureList[0][0]
631         gstop = newFeatureList[-1][1]
632         glen = abs(gstart - gstop)
633         if sense == "F":
634             if not useCDS and upstream > 0:
635                 if upstreamSpanTSS:
636                     if gstop > (gstart + upstream / 2):
637                         gstop = gstart + upstream / 2
638                 else:
639                     gstop = gstart
640             elif not useCDS and downstream > 0:
641                 gstart = gstop
642
643             if upstream > 0:
644                 if upstreamSpanTSS:
645                     distance = upstream / 2
646                 else:
647                     distance = upstream
648
649                 if adjustToNeighbor:
650                     nextGene = genomeObject.leftGeneDistance((genome, gid), distance * 2)
651                     if nextGene < distance * 2:
652                         distance = nextGene / 2
653
654                 if distance < 1:
655                     distance = 1
656
657                 gstart -= distance
658
659             if downstream > 0:
660                 distance = downstream
661                 if adjustToNeighbor:
662                     nextGene = genomeObject.rightGeneDistance((genome, gid), downstream * 2)
663                     if nextGene < downstream * 2:
664                         distance = nextGene / 2
665
666                 if distance < 1:
667                     distance = 1
668
669                 gstop += distance
670
671             if lengthCDS > 0:
672                 if lengthCDS < glen:
673                     gstop = newFeatureList[0][0] + lengthCDS
674
675             if lengthCDS < 0:
676                 if abs(lengthCDS) < glen:
677                     gstart = newFeatureList[-1][1] + lengthCDS
678         else:
679             if not useCDS and upstream > 0:
680                 if upstreamSpanTSS:
681                     if gstart < (gstop - upstream / 2):
682                         gstart = gstop - upstream / 2
683                 else:
684                     gstart = gstop
685             elif not useCDS and downstream > 0:
686                     gstop = gstart
687
688             if upstream > 0:
689                 if upstreamSpanTSS:
690                     distance = upstream /2
691                 else:
692                     distance = upstream
693
694                 if adjustToNeighbor:
695                     nextGene = genomeObject.rightGeneDistance((genome, gid), distance * 2)
696                     if nextGene < distance * 2:
697                         distance = nextGene / 2
698
699                 if distance < 1:
700                     distance = 1
701
702                 gstop += distance
703
704             if downstream > 0:
705                 distance = downstream
706                 if adjustToNeighbor:
707                     nextGene = genomeObject.leftGeneDistance((genome, gid), downstream * 2)
708                     if nextGene < downstream * 2:
709                         distance = nextGene / 2
710
711                 if distance < 1:
712                     distance = 1
713
714                 gstart -= distance
715
716             if lengthCDS > 0:
717                 if lengthCDS < glen:
718                     gstart = newFeatureList[-1][-1] - lengthCDS
719
720             if lengthCDS < 0:
721                 if abs(lengthCDS) < glen:
722                     gstop = newFeatureList[0][0] - lengthCDS
723
724         glen = abs(gstop - gstart)
725         if chrom not in locusByChromDict:
726             locusByChromDict[chrom] = []
727
728         if keepSense:
729             locusByChromDict[chrom].append((gstart, gstop, gid, glen, sense))
730         else:
731             locusByChromDict[chrom].append((gstart, gstop, gid, glen))
732
733     for chrom in locusByChromDict:
734         locusByChromDict[chrom].sort()
735
736     return locusByChromDict
737
738
739 def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[],
740                       normalizedTag=1., defaultRegionFormat=True, fixedFirstBin=-1,
741                       binLength=-1):
742     """ returns 2 dictionaries of bin counts and region lengths, given a dictionary of predefined regions,
743         a dictionary of reads, a number of bins, the length of reads, and optionally a list of regions
744         or a different weight / tag.
745     """
746     index = 0
747     regionsBins = {}
748     regionsLen = {}
749
750     if defaultRegionFormat:
751         regionIDField = 0
752         startField = 1
753         stopField = 2
754         lengthField = 3
755     else:
756         startField = 0
757         stopField = 1
758         regionIDField = 2
759         lengthField = 3
760
761     senseField = 4
762
763     print "entering computeRegionBins"
764     if len(regionList) > 0:
765         for readID in regionList:
766             regionsBins[readID] = [0.] * bins
767     else:
768         for chrom in regionsByChromDict:
769             for regionTuple in regionsByChromDict[chrom]:
770                 regionID = regionTuple[regionIDField]
771                 regionsBins[regionID] = [0.] * bins
772
773     for chrom in hitDict:
774         if chrom not in regionsByChromDict:
775             continue
776
777         for regionTuple in regionsByChromDict[chrom]:
778             regionID = regionTuple[regionIDField]
779             regionsLen[regionID] = regionTuple[lengthField]
780
781         print "%s\n" % chrom
782         startRegion = 0
783         for read in hitDict[chrom]:
784             tagStart = read["start"]
785             weight = read["weight"]
786             index += 1
787             if index % 100000 == 0:
788                 print "read %d " % index,
789
790             stopPoint = tagStart + readlen
791             if startRegion < 0:
792                 startRegion = 0
793
794             for regionTuple in regionsByChromDict[chrom][startRegion:]:
795                 start = regionTuple[startField]
796                 stop = regionTuple[stopField]
797                 regionID = regionTuple[regionIDField]
798                 rlen = regionTuple[lengthField]
799                 try:
800                     rsense = regionTuple[senseField]
801                 except:
802                     rsense = "F"
803
804                 if tagStart > stop:
805                     startRegion += 1
806                     continue
807
808                 if start > stopPoint:
809                     startRegion -= 10
810                     break
811
812                 if start <= tagStart <= stop:
813                     if binLength < 1:
814                         regionBinLength = rlen / bins
815                     else:
816                         regionBinLength = binLength
817
818                     startdist = tagStart - start
819                     if rsense == "F":
820                         # we are relying on python's integer division quirk
821                         binID = startdist / regionBinLength
822                         if (fixedFirstBin > 0) and (startdist < fixedFirstBin):
823                             binID = 0
824                         elif fixedFirstBin > 0:
825                             binID = 1
826
827                         if binID >= bins:
828                             binID = bins - 1
829
830                         try:
831                             regionsBins[regionID][binID] += normalizedTag * weight
832                         except KeyError:
833                             print "%s %s" % (regionID, str(binID))
834                     else:
835                         rdist = rlen - startdist
836                         binID = rdist / regionBinLength
837                         if (fixedFirstBin > 0) and (rdist < fixedFirstBin):
838                             binID = 0
839                         elif fixedFirstBin > 0:
840                             binID = 1
841
842                         if binID >= bins:
843                             binID = bins - 1
844
845                         try:
846                             regionsBins[regionID][binID] += normalizedTag * weight
847                         except KeyError:
848                             print "%s %s" % (regionID, str(binID))
849
850                     stopPoint = stop
851
852     return (regionsBins, regionsLen)