development release: conversion of ReadDataset to use BAM files
[erange.git] / commoncode.py
1 import ConfigParser
2 import os
3 import string
4 from time import strftime
5 from array import array
6 from collections import defaultdict
7 import Peak
8 from cistematic.core.geneinfo import geneinfoDB
9 from cistematic.genomes import Genome
10 import Region
11
12 commoncodeVersion = 5.6
13 currentRDSversion = 2.0
14
15 class ErangeError(Exception):
16     pass
17
18
19 def getReverseComplement(base):
20     revComp = {"A": "T",
21                "T": "A",
22                "G": "C",
23                "C": "G",
24                "N": "N"
25         }
26
27     return revComp[base]
28
29
30 def countDuplicatesInList(listToCheck):
31     tally = defaultdict(int)
32     for item in listToCheck:
33         tally[item] += 1
34
35     return tally.items()
36
37
38 def writeLog(logFile, messenger, message):
39     """ create a log file to write a message from a messenger or append to an existing file.
40     """
41     try:
42         logfile = open(logFile)
43     except IOError:
44         logfile = open(logFile, "w")
45     else:
46         logfile = open(logFile, "a")
47
48     logfile.writelines("%s: [%s] %s\n" % (strftime("%Y-%m-%d %H:%M:%S"), messenger, message))
49     logfile.close()
50
51
52 def getGeneInfoDict(genome, cache=False):
53     idb = geneinfoDB(cache=cache)
54     if genome == "dmelanogaster":
55         geneinfoDict = idb.getallGeneInfo(genome, infoKey="locus")
56     else:
57         geneinfoDict = idb.getallGeneInfo(genome)
58
59     return geneinfoDict
60
61
62 def getGeneAnnotDict(genome, inRAM=False):
63     return getExtendedGeneAnnotDict(genome, "", inRAM=inRAM)
64
65
66 def getExtendedGeneAnnotDict(genomeName, extendGenome, replaceModels=False, inRAM=False):
67     genome = Genome(genomeName, inRAM=inRAM)
68     if extendGenome != "":
69         genome.extendFeatures(extendGenome, replace=replaceModels)
70
71     geneannotDict = genome.allAnnotInfo()
72
73     return geneannotDict
74
75
76 def getConfigParser(fileList=[]):
77     configFiles = ["erange.config", os.path.expanduser("~/.erange.config")]
78     for filename in fileList:
79         configFiles.append(filename)
80
81     config = ConfigParser.SafeConfigParser()
82     config.read(configFiles)
83
84     return config
85
86
87 def getConfigOption(parser, section, option, default=None):
88     try:
89         setting = parser.get(section, option)
90     except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
91         setting = default
92
93     return setting
94
95
96 def getConfigIntOption(parser, section, option, default=None):
97     try:
98         setting = parser.getint(section, option)
99     except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
100         setting = default
101
102     return setting
103
104
105 def getConfigFloatOption(parser, section, option, default=None):
106     try:
107         setting = parser.getfloat(section, option)
108     except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
109         setting = default
110
111     return setting
112
113
114 def getConfigBoolOption(parser, section, option, default=None):
115     try:
116         setting = parser.getboolean(section, option)
117     except (ConfigParser.NoSectionError, ConfigParser.NoOptionError, ValueError):
118         setting = default
119
120     return setting
121
122
123 def getAllConfigSectionOptions(parser, section):
124     try:
125         setting = parser.items(section)
126     except ConfigParser.NoSectionError:
127         setting = []
128
129     return setting
130
131
132 def getMergedRegions(regionfilename, maxDist=1000, minHits=0, verbose=False, keepLabel=False,
133                      fullChrom=False, chromField=1, scoreField=4, pad=0, compact=False,
134                      doMerge=True, keepPeak=False, returnTop=0):
135
136     """ returns a dictionary containing a list of merged overlapping regions by chromosome; 
137     can optionally filter regions that have a scoreField fewer than minHits.
138     Can also optionally return the label of each region, as well as the
139     peak, if supplied (peakPos and peakHeight should be the last 2 fields).
140     Can return the top regions based on score if higher than minHits.
141     """
142     infile = open(regionfilename)
143     lines = infile.readlines()
144     regions = getMergedRegionsFromList(lines, maxDist, minHits, verbose, keepLabel,
145                                        fullChrom, chromField, scoreField, pad, compact,
146                                        doMerge, keepPeak, returnTop)
147
148     infile.close()
149
150     return regions
151
152
153 def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False, keepLabel=False,
154                      fullChrom = False, chromField=1, scoreField=4, pad=0, compact=False,
155                      doMerge=True, keepPeak=False, returnTop=0):
156     """ returns a dictionary containing a list of merged overlapping regions by chromosome; 
157     can optionally filter regions that have a scoreField fewer than minHits.
158     Can also optionally return the label of each region, as well as the
159     peak, if supplied (peakPos and peakHeight should be the last 2 fields).
160     Can return the top regions based on score if higher than minHits.
161     """
162     regions = {}
163     hasPvalue = 0
164     hasShift = 0
165     if 0 < returnTop < len(regionList):
166         scores = []
167         for regionEntry in regionList:
168             if regionEntry[0] == "#":
169                 if "pvalue" in regionEntry:
170                     hasPvalue = 1
171
172                 if "readShift" in regionEntry:
173                     hasShift = 1
174
175                 continue
176
177             fields = regionEntry.strip().split("\t")
178             hits = float(fields[scoreField].strip())
179             scores.append(hits)
180
181         scores.sort()
182         returnTop = -1 * returnTop 
183         minScore = scores[returnTop]
184         if minScore > minHits:
185             minHits = minScore
186
187     mergeCount = 0
188     chromField = int(chromField)
189     count = 0
190     #TODO: Current algorithm processes input file line by line and compares with prior lines.  Problem is it
191     #      exits at the first merge.  This is not a problem when the input is sorted by start position, but in
192     #      the case of 3 regions ABC that are in the input file as ACB as it goes now when processing C there
193     #      will be no merge with A as B is needed to bridge the two.  When it comes time to process B it will
194     #      be merged with A but that will exit the loop and the merge with C will be missed.
195     #TODO: QForAli - Are we going to require sorted region files to have this even work?
196     for regionEntry in regionList:
197         if regionEntry[0] == "#":
198             if "pvalue" in regionEntry:
199                 hasPvalue = 1
200
201             if "readShift" in regionEntry:
202                 hasShift = 1
203
204             continue
205
206         fields = regionEntry.strip().split("\t")
207         if minHits >= 0:
208             try:
209                 hits = float(fields[scoreField].strip())
210             except (IndexError, ValueError):
211                 continue
212
213             if hits < minHits:
214                 continue
215
216         if compact:
217             (chrom, pos) = fields[chromField].split(":")
218             (front, back) = pos.split("-")
219             start = int(front)
220             stop = int(back)
221         elif chromField > 1:
222             label = string.join(fields[:chromField],"\t")
223             chrom = fields[chromField]
224             start = int(fields[chromField + 1]) - pad
225             stop = int(fields[chromField + 2]) + pad
226         else:
227             label = fields[0]
228             chrom = fields[1]
229             start = int(fields[2]) - pad
230             stop = int(fields[3]) + pad
231
232         if not fullChrom:
233             chrom = chrom[3:]
234
235         if keepPeak:
236             peakPos = int(fields[-2 - hasPvalue - hasShift])
237             peakHeight = float(fields[-1 - hasPvalue - hasShift])
238
239         if chrom not in regions:
240             regions[chrom] = []
241
242         merged = False
243
244         if doMerge and len(regions[chrom]) > 0:
245             for index in range(len(regions[chrom])):
246                 region = regions[chrom][index]
247                 rstart = region.start
248                 rstop = region.stop
249                 if regionsOverlap(start, stop, rstart, rstop) or regionsAreWithinDistance(start, stop, rstart, rstop, maxDist):
250                     if start < rstart:
251                         rstart = start
252
253                     if rstop < stop:
254                         rstop = stop
255
256                     if keepPeak:
257                         rpeakPos = region.peakPos
258                         rpeakHeight = region.peakHeight
259                         if peakHeight > rpeakHeight:
260                             rpeakHeight = peakHeight
261                             rpeakPos = peakPos
262
263                     regions[chrom][index].start = rstart
264                     regions[chrom][index].stop = rstop
265                     regions[chrom][index].length = abs(rstop - rstart)
266                     if keepLabel:
267                         regions[chrom][index].label = label
268
269                     if keepPeak:
270                         regions[chrom][index].peakPos = rpeakPos
271                         regions[chrom][index].peakHeight = rpeakHeight
272
273
274                     mergeCount += 1
275                     merged = True
276                     break
277
278         if not merged:
279             region = Region.Region(start, stop)
280             if keepLabel:
281                 region.label = label
282
283             if keepPeak:
284                 region.peakPos = peakPos
285                 region.peakHeight = peakHeight
286
287             regions[chrom].append(region)
288             count += 1
289
290         if verbose and (count % 100000 == 0):
291             print count
292
293     regionCount = 0
294     for chrom in regions:
295         regionCount += len(regions[chrom])
296         regions[chrom].sort(cmp=lambda x,y:cmp(x.start, y.start))
297
298     if verbose:
299         print "merged %d times" % mergeCount
300         print "returning %d regions" % regionCount
301
302     return regions
303
304
305 def regionsOverlap(start, stop, rstart, rstop):
306     if start > stop:
307         (start, stop) = (stop, start)
308
309     if rstart > rstop:
310         (rstart, rstop) = (rstop, rstart)
311
312     return (rstart <= start <= rstop) or (rstart <= stop <= rstop) or (start <= rstart <= stop) or (start <= rstop <= stop)
313
314
315 def regionsAreWithinDistance(start, stop, rstart, rstop, maxDist):
316     if start > stop:
317         (start, stop) = (stop, start)
318
319     if rstart > rstop:
320         (rstart, rstop) = (rstop, rstart)
321
322     return (abs(rstart-stop) <= maxDist) or (abs(rstop-start) <= maxDist)
323
324
325 def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False,
326              shift=0, maxshift=75):
327     """ find the peak in a list of reads (hitlist) in a region
328     of a given length and absolute start point. returns a
329     list of peaks, the number of hits, a triangular-smoothed
330     version of hitlist, and the number of reads that are
331     forward (plus) sense.
332     If doWeight is True, weight the reads accordingly.
333     If leftPlus is True, return the number of plus reads left of
334     the peak, taken to be the first TopPos position.
335     """
336
337     if shift == "auto":
338         shift = getBestShiftForRegion(hitList, start, length, useWeight=doWeight, maxShift=maxshift)
339
340     seqArray, regionArray, numHits, numPlus = findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, leftPlus)
341
342     # implementing a triangular smooth
343     smoothArray = array("f", [0.] * length)
344     #TODO: QForAli - really?  We're going to insert MANUFACTURED data (0.0's) and use them in the calculation
345     # (probably) just to avoid IndexError exceptions.  Then we are going to completely ignore adjusting the
346     # last 2 elements of the array for (probably) the same issue.  Is this how erange is dealing with the
347     # edge cases?
348     for pos in range(2,length -2):
349         smoothArray[pos] = (seqArray[pos -2] + 2 * seqArray[pos - 1] + 3 * seqArray[pos] + 2 * seqArray[pos + 1] + seqArray[pos + 2]) / 9.0
350
351     topPos = getPeakPositionList(smoothArray, length)
352     peak = Peak.Peak(topPos, numHits, smoothArray, numPlus, shift=shift)
353
354     if leftPlus:
355         numLeftPlus = 0
356         maxPos = topPos[0]
357         for read in regionArray:
358             if doWeight:
359                 weight = read["weight"]
360             else:
361                 weight = 1.0
362
363             currentPos = read["start"] - start
364             if currentPos <= maxPos and read["sense"] == "+":
365                 numLeftPlus += weight
366
367         peak.numLeftPlus = numLeftPlus
368
369     return peak
370
371
372 def getBestShiftForRegion(readList, start, length, useWeight=False, maxShift=75):
373     bestShift = 0
374     lowestScore = 20000000000
375     for testShift in xrange(maxShift + 1):
376         shiftArray = array("f", [0.] * length)
377         for read in readList:
378             currentpos = read["start"] - start
379             if read["sense"] == "+":
380                 senseModifier = 1
381             else:
382                 senseModifier = -1
383
384             currentpos += testShift * senseModifier
385             if (currentpos < 1) or (currentpos >= length):
386                 continue
387
388             if useWeight:
389                 weight = read["weight"]
390             else:
391                 weight = 1.0
392
393             shiftArray[currentpos] += weight * senseModifier
394
395         currentScore = 0
396         for score in shiftArray:
397             currentScore += abs(score)
398
399         if currentScore < lowestScore:
400             bestShift = testShift
401             lowestScore = currentScore
402
403     return bestShift
404
405
406 def findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, leftPlus):
407     seqArray = array("f", [0.] * length)
408     numHits = 0.
409     numPlus = 0.
410     regionArray = []
411     for read in hitList:
412         currentpos = read["start"] - start
413         if read["sense"] == "+":
414             currentpos += shift
415         else:
416             currentpos -= shift
417
418         if (currentpos <  1 - readlen) or (currentpos >= length):
419             continue
420
421         if doWeight:
422             weight = read["weight"]
423         else:
424             weight = 1.0
425
426         numHits += weight
427         if leftPlus:
428             regionArray.append(read)
429
430         hitIndex = 0
431         while currentpos < 0:
432             hitIndex += 1
433             currentpos += 1
434
435         while hitIndex < readlen and currentpos < length:
436             seqArray[currentpos] += weight
437             hitIndex += 1
438             currentpos += 1
439
440         if read["sense"] == "+":
441             numPlus += weight
442
443     return seqArray, regionArray, numHits, numPlus
444
445
446 def getPeakPositionList(smoothArray, length):
447     topNucleotide = 0
448     peakList = []
449     for currentpos in xrange(length):
450         if topNucleotide < smoothArray[currentpos]:
451             topNucleotide = smoothArray[currentpos]
452             peakList = [currentpos]
453         elif topNucleotide == smoothArray[currentpos]:
454             peakList.append(currentpos)
455
456     return peakList
457
458
459 def getFeaturesByChromDict(genomeObject, additionalRegionsDict={}, ignorePseudo=False,
460                            restrictList=[], regionComplement=False, maxStop=250000000):
461     """ return a dictionary of cistematic gene features. Requires
462     cistematic, obviously. Can filter-out pseudogenes. Will use
463     additional regions dict to supplement gene models, if available.
464     Can restrict output to a list of GIDs.
465     If regionComplement is set to true, returns the regions *outside* of the
466     calculated boundaries, which is useful for retrieving intronic and
467     intergenic regions. maxStop is simply used to define the uppermost
468     boundary of the complement region.
469     """ 
470     featuresDict = genomeObject.getallGeneFeatures()
471     restrictGID = False
472     if len(restrictList) > 0:
473         restrictGID = True
474
475     if len(additionalRegionsDict) > 0:
476         sortList = []
477         for chrom in additionalRegionsDict:
478             for region in additionalRegionsDict[chrom]:
479                 label = region.label
480                 if label not in sortList:
481                     sortList.append(label)
482
483                 if label not in featuresDict:
484                     featuresDict[label] = []
485                     sense = "+"
486                 else:
487                     sense = featuresDict[label][0][-1]
488
489                 featuresDict[label].append(("custom", chrom, region.start, region.stop, sense))
490
491         for gid in sortList:
492             featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
493
494     featuresByChromDict = {}
495     for gid in featuresDict:
496         if restrictGID and gid not in restrictList:
497             continue
498
499         newFeatureList, chrom, sense = getNewFeatureList(featuresDict[gid])
500         if newFeatureList and chrom not in featuresByChromDict:
501             featuresByChromDict[chrom] = []
502
503         for (start, stop, ftype) in newFeatureList:
504             featuresByChromDict[chrom].append((start, stop, gid, sense, ftype))
505
506     for chrom in featuresByChromDict:
507         featuresByChromDict[chrom].sort()
508
509     if regionComplement:
510         complementByChromDict = {}
511         complementIndex = 0
512         for chrom in featuresByChromDict:
513             complementByChromDict[chrom] = []
514             listLength = len(featuresByChromDict[chrom])
515             if listLength > 0:
516                 currentStart = 0
517                 for index in range(listLength):
518                     currentStop = featuresByChromDict[chrom][index][0]
519                     complementIndex += 1
520                     if currentStart < currentStop:
521                         complementByChromDict[chrom].append((currentStart, currentStop, "nonExon%d" % complementIndex, "F", "nonExon"))
522
523                     currentStart = featuresByChromDict[chrom][index][1]
524
525                 currentStop = maxStop
526                 complementByChromDict[chrom].append((currentStart, currentStop, "nonExon%d" % complementIndex, "F", "nonExon"))
527
528         return (featuresByChromDict, complementByChromDict)
529     else:
530         return featuresByChromDict
531
532
533 def getNewFeatureList(featureList, ignorePseudo=False):
534     newFeatureList = []
535     for (featureType, chrom, start, stop, sense) in featureList:
536         if featureType == "PSEUDO" and ignorePseudo:
537             return [], chrom, sense
538
539         if (start, stop, featureType) not in newFeatureList:
540             notContained = True
541             containedList = []
542             for (newFeatureStart, newFeatureStop, featureType2) in newFeatureList:
543                 if start >= newFeatureStart and stop <= newFeatureStop:
544                     notContained = False
545
546                 if start < newFeatureStart and stop > newFeatureStop:
547                     containedList.append((newFeatureStart, newFeatureStop))
548
549             if len(containedList) > 0:
550                 newFList = []
551                 notContained = True
552                 for (newFeatureStart, newFeatureStop, featureType2) in newFeatureList:
553                     if (newFeatureStart, newFeatureStop) not in containedList:
554                         newFList.append((newFeatureStart, newFeatureStop, featureType2))
555                         if start >= newFeatureStart and stop <= newFeatureStop:
556                             notContained = False
557
558                 newFeatureList = newFList
559
560             if notContained:
561                 newFeatureList.append((start, stop, featureType))
562
563     return newFeatureList, chrom, sense
564
565
566 def getLocusByChromDict(genome, upstream=0, downstream=0, useCDS=True,
567                         additionalRegionsDict={}, ignorePseudo=False, upstreamSpanTSS=False,
568                         lengthCDS=0, keepSense=False, adjustToNeighbor=True):
569     """ return a dictionary of gene loci. Can be used to retrieve additional
570     sequence upstream or downstream of gene, up to the next gene. Requires
571     cistematic, obviously.
572     Can filter-out pseudogenes and use additional regions outside of existing
573     gene models. Use upstreamSpanTSS to overlap half of the upstream region
574     over the TSS.
575     If lengthCDS > 0 bp, e.g. X, return only the starting X bp from CDS. If
576     lengthCDS < 0bp, return only the last X bp from CDS.
577     """ 
578     locusByChromDict = {}
579     if usingBadLocusParameters(upstream, downstream, useCDS, lengthCDS, upstreamSpanTSS):
580         return locusByChromDict
581
582     genomeName = genome.genome
583     featuresDict = getGeneFeatures(genome, additionalRegionsDict)
584     for gid in featuresDict:
585         featureList = featuresDict[gid]
586         newFeatureList = []
587         for (ftype, chrom, start, stop, sense) in featureList:
588             newFeatureList.append((start, stop))
589
590         if ignorePseudo and ftype == "PSEUDO":
591             continue
592
593         newFeatureList.sort()
594
595         sense = featureList[0][-1]
596         gstart = newFeatureList[0][0]
597         gstop = newFeatureList[-1][1]
598         glen = abs(gstart - gstop)
599         if sense == "F":
600             if not useCDS and upstream > 0:
601                 if upstreamSpanTSS:
602                     if gstop > (gstart + upstream / 2):
603                         gstop = gstart + upstream / 2
604                 else:
605                     gstop = gstart
606             elif not useCDS and downstream > 0:
607                 gstart = gstop
608
609             if upstream > 0:
610                 if upstreamSpanTSS:
611                     distance = upstream / 2
612                 else:
613                     distance = upstream
614
615                 if adjustToNeighbor:
616                     nextGene = genome.leftGeneDistance((genomeName, gid), distance * 2)
617                     if nextGene < distance * 2:
618                         distance = nextGene / 2
619
620                 distance = max(distance, 1)
621                 gstart -= distance
622
623             if downstream > 0:
624                 distance = downstream
625                 if adjustToNeighbor:
626                     nextGene = genome.rightGeneDistance((genomeName, gid), downstream * 2)
627                     if nextGene < downstream * 2:
628                         distance = nextGene / 2
629
630                 distance = max(distance, 1)
631                 gstop += distance
632
633             if 0 < lengthCDS < glen:
634                 gstop = newFeatureList[0][0] + lengthCDS
635
636             if lengthCDS < 0 and abs(lengthCDS) < glen:
637                 gstart = newFeatureList[-1][1] + lengthCDS
638         else:
639             if not useCDS and upstream > 0:
640                 if upstreamSpanTSS:
641                     if gstart < (gstop - upstream / 2):
642                         gstart = gstop - upstream / 2
643                 else:
644                     gstart = gstop
645             elif not useCDS and downstream > 0:
646                     gstop = gstart
647
648             if upstream > 0:
649                 if upstreamSpanTSS:
650                     distance = upstream /2
651                 else:
652                     distance = upstream
653
654                 if adjustToNeighbor:
655                     nextGene = genome.rightGeneDistance((genomeName, gid), distance * 2)
656                     if nextGene < distance * 2:
657                         distance = nextGene / 2
658
659                 distance = max(distance, 1)
660                 gstop += distance
661
662             if downstream > 0:
663                 distance = downstream
664                 if adjustToNeighbor:
665                     nextGene = genome.leftGeneDistance((genomeName, gid), downstream * 2)
666                     if nextGene < downstream * 2:
667                         distance = nextGene / 2
668
669                 distance = max(distance, 1)
670                 gstart -= distance
671
672             if 0 < lengthCDS < glen:
673                 gstart = newFeatureList[-1][-1] - lengthCDS
674
675             if lengthCDS < 0 and abs(lengthCDS) < glen:
676                 gstop = newFeatureList[0][0] - lengthCDS
677
678         if chrom not in locusByChromDict:
679             locusByChromDict[chrom] = []
680
681         if keepSense:
682             locusByChromDict[chrom].append((gstart, gstop, gid, glen, sense))
683         else:
684             locusByChromDict[chrom].append((gstart, gstop, gid, glen))
685
686     for chrom in locusByChromDict:
687         locusByChromDict[chrom].sort()
688
689     return locusByChromDict
690
691
692 def usingBadLocusParameters(upstream, downstream, useCDS, lengthCDS, upstreamSpanTSS):
693     if upstream == 0 and downstream == 0 and not useCDS:
694         print "getLocusByChromDict: asked for no sequence - returning empty dict"
695         return True
696     elif upstream > 0 and downstream > 0 and not useCDS:
697         print "getLocusByChromDict: asked for only upstream and downstream - returning empty dict"
698         return True
699     elif lengthCDS != 0 and not useCDS:
700         print "getLocusByChromDict: asked for partial CDS but not useCDS - returning empty dict"
701         return True
702     elif upstreamSpanTSS and lengthCDS != 0:
703         print "getLocusByChromDict: asked for TSS spanning and partial CDS - returning empty dict"
704         return True
705     elif lengthCDS > 0 and downstream > 0:
706         print "getLocusByChromDict: asked for discontinuous partial CDS from start and downstream - returning empty dict"
707         return True
708     elif lengthCDS < 0 and upstream > 0:
709         print "getLocusByChromDict: asked for discontinuous partial CDS from stop and upstream - returning empty dict"
710         return True
711
712     return False
713
714
715 def getGeneFeatures(genome, additionalRegionsDict):
716     featuresDict = genome.getallGeneFeatures()
717     if len(additionalRegionsDict) > 0:
718         sortList = []
719         for chrom in additionalRegionsDict:
720             for region in additionalRegionsDict[chrom]:
721                 label = region.label
722                 if label not in sortList:
723                     sortList.append(label)
724
725                 if label not in featuresDict:
726                     featuresDict[label] = []
727                     sense = "+"
728                 else:
729                     sense = featuresDict[label][0][-1]
730
731                 featuresDict[label].append(("custom", chrom, region.start, region.stop, sense))
732
733         for gid in sortList:
734             featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
735
736     return featuresDict
737
738
739 def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[],
740                       normalizedTag=1., defaultRegionFormat=True, fixedFirstBin=-1,
741                       binLength=-1):
742     """ returns 2 dictionaries of bin counts and region lengths, given a dictionary of predefined regions,
743         a dictionary of reads, a number of bins, the length of reads, and optionally a list of regions
744         or a different weight / tag.
745     """
746     index = 0
747     regionsBins = {}
748     regionsLen = {}
749
750     if defaultRegionFormat:
751         regionIDField = 0
752         startField = 1
753         stopField = 2
754         lengthField = 3
755     else:
756         startField = 0
757         stopField = 1
758         regionIDField = 2
759         lengthField = 3
760
761     senseField = 4
762
763     print "entering computeRegionBins"
764     if len(regionList) > 0:
765         for readID in regionList:
766             regionsBins[readID] = [0.] * bins
767     else:
768         for chrom in regionsByChromDict:
769             for regionTuple in regionsByChromDict[chrom]:
770                 regionID = regionTuple[regionIDField]
771                 regionsBins[regionID] = [0.] * bins
772
773     for chrom in hitDict:
774         if chrom not in regionsByChromDict:
775             continue
776
777         for regionTuple in regionsByChromDict[chrom]:
778             regionID = regionTuple[regionIDField]
779             regionsLen[regionID] = regionTuple[lengthField]
780
781         print "%s\n" % chrom
782         startRegion = 0
783         for read in hitDict[chrom]:
784             tagStart = read["start"]
785             weight = read["weight"]
786             index += 1
787             if index % 100000 == 0:
788                 print "read %d " % index,
789
790             stopPoint = tagStart + readlen
791             if startRegion < 0:
792                 startRegion = 0
793
794             for regionTuple in regionsByChromDict[chrom][startRegion:]:
795                 start = regionTuple[startField]
796                 stop = regionTuple[stopField]
797                 regionID = regionTuple[regionIDField]
798                 rlen = regionTuple[lengthField]
799                 try:
800                     rsense = regionTuple[senseField]
801                 except IndexError:
802                     rsense = "F"
803
804                 if tagStart > stop:
805                     startRegion += 1
806                     continue
807
808                 if start > stopPoint:
809                     startRegion -= 10
810                     break
811
812                 if start <= tagStart <= stop:
813                     if binLength < 1:
814                         regionBinLength = rlen / bins
815                     else:
816                         regionBinLength = binLength
817
818                     if rsense == "F":
819                         positionInRegion = tagStart - start
820                     else:
821                         positionInRegion = start + rlen - tagStart
822
823                     # we are relying on python's integer division quirk
824                     binID = positionInRegion / regionBinLength
825                     if (fixedFirstBin > 0) and (positionInRegion < fixedFirstBin):
826                         binID = 0
827                     elif fixedFirstBin > 0:
828                         binID = 1
829
830                     if binID >= bins:
831                         binID = bins - 1
832
833                     try:
834                         regionsBins[regionID][binID] += normalizedTag * weight
835                     except KeyError:
836                         print "%s %s" % (regionID, str(binID))
837
838                     stopPoint = stop
839
840     return (regionsBins, regionsLen)