commoncode.py

   1 #
   2 #  commoncode.py
   3 #  ENRAGE
   4 #
   5
   6 import ConfigParser
   7 import os
   8 import string
   9 from time import strftime
  10 from array import array
  11 from collections import defaultdict
  12 import Peak
  13 from cistematic.core.geneinfo import geneinfoDB
  14 from cistematic.genomes import Genome
  15 import Region
  16
  17 commoncodeVersion = 5.6
  18 currentRDSversion = 2.0
  19
  20 class ErangeError(Exception):
  21     pass
  22
  23
  24 def getReverseComplement(base):
  25     revComp = {"A": "T",
  26                "T": "A",
  27                "G": "C",
  28                "C": "G",
  29                "N": "N"
  30         }
  31
  32     return revComp[base]
  33
  34
  35 def countDuplicatesInList(listToCheck):
  36     tally = defaultdict(int)
  37     for item in listToCheck:
  38         tally[item] += 1
  39
  40     return tally.items()
  41
  42
  43 def writeLog(logFile, messenger, message):
  44     """ create a log file to write a message from a messenger or append to an existing file.
  45     """
  46     try:
  47         logfile = open(logFile)
  48     except IOError:
  49         logfile = open(logFile, "w")
  50     else:
  51         logfile = open(logFile, "a")
  52
  53     logfile.writelines("%s: [%s] %s\n" % (strftime("%Y-%m-%d %H:%M:%S"), messenger, message))
  54     logfile.close()
  55
  56
  57 def getGeneInfoDict(genome, cache=False):
  58     idb = geneinfoDB(cache=cache)
  59     if genome == "dmelanogaster":
  60         geneinfoDict = idb.getallGeneInfo(genome, infoKey="locus")
  61     else:
  62         geneinfoDict = idb.getallGeneInfo(genome)
  63
  64     return geneinfoDict
  65
  66
  67 def getGeneAnnotDict(genome, inRAM=False):
  68     return getExtendedGeneAnnotDict(genome, "", inRAM=inRAM)
  69
  70
  71 def getExtendedGeneAnnotDict(genomeName, extendGenome, replaceModels=False, inRAM=False):
  72     genome = Genome(genomeName, inRAM=inRAM)
  73     if extendGenome != "":
  74         genome.extendFeatures(extendGenome, replace=replaceModels)
  75
  76     geneannotDict = genome.allAnnotInfo()
  77
  78     return geneannotDict
  79
  80
  81 def getConfigParser(fileList=[]):
  82     configFiles = ["erange.config", os.path.expanduser("~/.erange.config")]
  83     for filename in fileList:
  84         configFiles.append(filename)
  85
  86     config = ConfigParser.SafeConfigParser()
  87     config.read(configFiles)
  88
  89     return config
  90
  91
  92 def getConfigOption(parser, section, option, default=None):
  93     try:
  94         setting = parser.get(section, option)
  95     except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
  96         setting = default
  97
  98     return setting
  99
 100
 101 def getConfigIntOption(parser, section, option, default=None):
 102     try:
 103         setting = parser.getint(section, option)
 104     except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
 105         setting = default
 106
 107     return setting
 108
 109
 110 def getConfigFloatOption(parser, section, option, default=None):
 111     try:
 112         setting = parser.getfloat(section, option)
 113     except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
 114         setting = default
 115
 116     return setting
 117
 118
 119 def getConfigBoolOption(parser, section, option, default=None):
 120     try:
 121         setting = parser.getboolean(section, option)
 122     except (ConfigParser.NoSectionError, ConfigParser.NoOptionError, ValueError):
 123         setting = default
 124
 125     return setting
 126
 127
 128 def getAllConfigSectionOptions(parser, section):
 129     try:
 130         setting = parser.items(section)
 131     except ConfigParser.NoSectionError:
 132         setting = []
 133
 134     return setting
 135
 136
 137 def getMergedRegions(regionfilename, maxDist=1000, minHits=0, verbose=False, keepLabel=False,
 138                      fullChrom=False, chromField=1, scoreField=4, pad=0, compact=False,
 139                      doMerge=True, keepPeak=False, returnTop=0):
 140
 141     """ returns a dictionary containing a list of merged overlapping regions by chromosome;
 142     can optionally filter regions that have a scoreField fewer than minHits.
 143     Can also optionally return the label of each region, as well as the
 144     peak, if supplied (peakPos and peakHeight should be the last 2 fields).
 145     Can return the top regions based on score if higher than minHits.
 146     """
 147     infile = open(regionfilename)
 148     lines = infile.readlines()
 149     regions = getMergedRegionsFromList(lines, maxDist, minHits, verbose, keepLabel,
 150                                        fullChrom, chromField, scoreField, pad, compact,
 151                                        doMerge, keepPeak, returnTop)
 152
 153     infile.close()
 154
 155     return regions
 156
 157
 158 def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False, keepLabel=False,
 159                      fullChrom = False, chromField=1, scoreField=4, pad=0, compact=False,
 160                      doMerge=True, keepPeak=False, returnTop=0):
 161     """ returns a dictionary containing a list of merged overlapping regions by chromosome;
 162     can optionally filter regions that have a scoreField fewer than minHits.
 163     Can also optionally return the label of each region, as well as the
 164     peak, if supplied (peakPos and peakHeight should be the last 2 fields).
 165     Can return the top regions based on score if higher than minHits.
 166     """
 167     regions = {}
 168     hasPvalue = 0
 169     hasShift = 0
 170     if 0 < returnTop < len(regionList):
 171         scores = []
 172         for regionEntry in regionList:
 173             if regionEntry[0] == "#":
 174                 if "pvalue" in regionEntry:
 175                     hasPvalue = 1
 176
 177                 if "readShift" in regionEntry:
 178                     hasShift = 1
 179
 180                 continue
 181
 182             fields = regionEntry.strip().split("\t")
 183             hits = float(fields[scoreField].strip())
 184             scores.append(hits)
 185
 186         scores.sort()
 187         returnTop = -1 * returnTop
 188         minScore = scores[returnTop]
 189         if minScore > minHits:
 190             minHits = minScore
 191
 192     mergeCount = 0
 193     chromField = int(chromField)
 194     count = 0
 195     for regionEntry in regionList:
 196         if regionEntry[0] == "#":
 197             if "pvalue" in regionEntry:
 198                 hasPvalue = 1
 199
 200             if "readShift" in regionEntry:
 201                 hasShift = 1
 202
 203             continue
 204
 205         fields = regionEntry.strip().split("\t")
 206         if minHits >= 0:
 207             try:
 208                 hits = float(fields[scoreField].strip())
 209             except (IndexError, ValueError):
 210                 continue
 211
 212             if hits < minHits:
 213                 continue
 214
 215         if compact:
 216             (chrom, pos) = fields[chromField].split(":")
 217             (front, back) = pos.split("-")
 218             start = int(front)
 219             stop = int(back)
 220         elif chromField > 1:
 221             label = string.join(fields[:chromField],"\t")
 222             chrom = fields[chromField]
 223             start = int(fields[chromField + 1]) - pad
 224             stop = int(fields[chromField + 2]) + pad
 225         else:
 226             label = fields[0]
 227             chrom = fields[1]
 228             start = int(fields[2]) - pad
 229             stop = int(fields[3]) + pad
 230
 231         if not fullChrom:
 232             chrom = chrom[3:]
 233
 234         if keepPeak:
 235             peakPos = int(fields[-2 - hasPvalue - hasShift])
 236             peakHeight = float(fields[-1 - hasPvalue - hasShift])
 237
 238         if chrom not in regions:
 239             regions[chrom] = []
 240
 241         merged = False
 242
 243         if doMerge and len(regions[chrom]) > 0:
 244             for index in range(len(regions[chrom])):
 245                 region = regions[chrom][index]
 246                 rstart = region.start
 247                 rstop = region.stop
 248                 if regionsOverlap(start, stop, rstart, rstop) or regionsAreWithinDistance(start, stop, rstart, rstop, maxDist):
 249                     if start < rstart:
 250                         rstart = start
 251
 252                     if rstop < stop:
 253                         rstop = stop
 254
 255                     if keepPeak:
 256                         rpeakPos = region.peakPos
 257                         rpeakHeight = region.peakHeight
 258                         if peakHeight > rpeakHeight:
 259                             rpeakHeight = peakHeight
 260                             rpeakPos = peakPos
 261
 262                     regions[chrom][index].start = rstart
 263                     regions[chrom][index].stop = rstop
 264                     regions[chrom][index].length = abs(rstop - rstart)
 265                     if keepLabel:
 266                         regions[chrom][index].label = label
 267
 268                     if keepPeak:
 269                         regions[chrom][index].peakPos = rpeakPos
 270                         regions[chrom][index].peakHeight = rpeakHeight
 271
 272
 273                     mergeCount += 1
 274                     merged = True
 275                     break
 276
 277         if not merged:
 278             region = Region.Region(start, stop)
 279             if keepLabel:
 280                 region.label = label
 281
 282             if keepPeak:
 283                 region.peakPos = peakPos
 284                 region.peakHeight = peakHeight
 285
 286             regions[chrom].append(region)
 287             count += 1
 288
 289         if verbose and (count % 100000 == 0):
 290             print count
 291
 292     regionCount = 0
 293     for chrom in regions:
 294         regionCount += len(regions[chrom])
 295         regions[chrom].sort(cmp=lambda x,y:cmp(x.start, y.start))
 296
 297     if verbose:
 298         print "merged %d times" % mergeCount
 299         print "returning %d regions" % regionCount
 300
 301     return regions
 302
 303
 304 def regionsOverlap(start, stop, rstart, rstop):
 305     if start > stop:
 306         (start, stop) = (stop, start)
 307
 308     if rstart > rstop:
 309         (rstart, rstop) = (rstop, rstart)
 310
 311     return (rstart <= start <= rstop) or (rstart <= stop <= rstop) or (start <= rstart <= stop) or (start <= rstop <= stop)
 312
 313
 314 def regionsAreWithinDistance(start, stop, rstart, rstop, maxDist):
 315     if start > stop:
 316         (start, stop) = (stop, start)
 317
 318     if rstart > rstop:
 319         (rstart, rstop) = (rstop, rstart)
 320
 321     return (abs(rstart-stop) <= maxDist) or (abs(rstop-start) <= maxDist)
 322
 323
 324 def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False,
 325              shift=0, maxshift=75):
 326     """ find the peak in a list of reads (hitlist) in a region
 327     of a given length and absolute start point. returns a
 328     list of peaks, the number of hits, a triangular-smoothed
 329     version of hitlist, and the number of reads that are
 330     forward (plus) sense.
 331     If doWeight is True, weight the reads accordingly.
 332     If leftPlus is True, return the number of plus reads left of
 333     the peak, taken to be the first TopPos position.
 334     """
 335
 336     if shift == "auto":
 337         shift = getBestShiftForRegion(hitList, start, length, useWeight=doWeight, maxShift=maxshift)
 338
 339     seqArray, regionArray, numHits, numPlus = findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, leftPlus)
 340
 341     # implementing a triangular smooth
 342     smoothArray = array("f", [0.] * length)
 343     for pos in range(2,length -2):
 344         smoothArray[pos] = (seqArray[pos -2] + 2 * seqArray[pos - 1] + 3 * seqArray[pos] + 2 * seqArray[pos + 1] + seqArray[pos + 2]) / 9.0
 345
 346     topPos = getPeakPositionList(smoothArray, length)
 347     peak = Peak.Peak(topPos, numHits, smoothArray, numPlus, shift=shift)
 348
 349     if leftPlus:
 350         numLeftPlus = 0
 351         maxPos = topPos[0]
 352         for read in regionArray:
 353             if doWeight:
 354                 weight = read["weight"]
 355             else:
 356                 weight = 1.0
 357
 358             currentPos = read["start"] - start
 359             if currentPos <= maxPos and read["sense"] == "+":
 360                 numLeftPlus += weight
 361
 362         peak.numLeftPlus = numLeftPlus
 363
 364     return peak
 365
 366
 367 def getBestShiftForRegion(readList, start, length, useWeight=False, maxShift=75):
 368     bestShift = 0
 369     lowestScore = 20000000000
 370     for testShift in xrange(maxShift + 1):
 371         shiftArray = array("f", [0.] * length)
 372         for read in readList:
 373             currentpos = read["start"] - start
 374             if read["sense"] == "+":
 375                 currentpos += testShift
 376             else:
 377                 currentpos -= testShift
 378
 379             if (currentpos < 1) or (currentpos >= length):
 380                 continue
 381
 382             if useWeight:
 383                 weight = read["weight"]
 384             else:
 385                 weight = 1.0
 386
 387             if read["sense"] == "+":
 388                 shiftArray[currentpos] += weight
 389             else:
 390                 shiftArray[currentpos] -= weight
 391
 392         currentScore = 0
 393         for score in shiftArray:
 394             currentScore += abs(score)
 395
 396         print currentScore
 397         if currentScore < lowestScore:
 398             bestShift = testShift
 399             lowestScore = currentScore
 400
 401     return bestShift
 402
 403
 404 def findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, leftPlus):
 405     seqArray = array("f", [0.] * length)
 406     numHits = 0.
 407     numPlus = 0.
 408     regionArray = []
 409     for read in hitList:
 410         currentpos = read["start"] - start
 411         if read["sense"] == "+":
 412             currentpos += shift
 413         else:
 414             currentpos -= shift
 415
 416         if (currentpos <  1 - readlen) or (currentpos >= length):
 417             continue
 418
 419         if doWeight:
 420             weight = read["weight"]
 421         else:
 422             weight = 1.0
 423
 424         numHits += weight
 425         if leftPlus:
 426             regionArray.append(read)
 427
 428         hitIndex = 0
 429         while currentpos < 0:
 430             hitIndex += 1
 431             currentpos += 1
 432
 433         while hitIndex < readlen and currentpos < length:
 434             seqArray[currentpos] += weight
 435             hitIndex += 1
 436             currentpos += 1
 437
 438         if read["sense"] == "+":
 439             numPlus += weight
 440
 441     return seqArray, regionArray, numHits, numPlus
 442
 443
 444 def getPeakPositionList(smoothArray, length):
 445     topNucleotide = 0
 446     peakList = []
 447     for currentpos in xrange(length):
 448         if topNucleotide < smoothArray[currentpos]:
 449             topNucleotide = smoothArray[currentpos]
 450             peakList = [currentpos]
 451         elif topNucleotide == smoothArray[currentpos]:
 452             peakList.append(currentpos)
 453
 454     return peakList
 455
 456
 457 def getFeaturesByChromDict(genomeObject, additionalRegionsDict={}, ignorePseudo=False,
 458                            restrictList=[], regionComplement=False, maxStop=250000000):
 459     """ return a dictionary of cistematic gene features. Requires
 460     cistematic, obviously. Can filter-out pseudogenes. Will use
 461     additional regions dict to supplement gene models, if available.
 462     Can restrict output to a list of GIDs.
 463     If regionComplement is set to true, returns the regions *outside* of the
 464     calculated boundaries, which is useful for retrieving intronic and
 465     intergenic regions. maxStop is simply used to define the uppermost
 466     boundary of the complement region.
 467     """
 468     featuresDict = genomeObject.getallGeneFeatures()
 469     restrictGID = False
 470     if len(restrictList) > 0:
 471         restrictGID = True
 472
 473     if len(additionalRegionsDict) > 0:
 474         sortList = []
 475         for chrom in additionalRegionsDict:
 476             for region in additionalRegionsDict[chrom]:
 477                 label = region.label
 478                 if label not in sortList:
 479                     sortList.append(label)
 480
 481                 if label not in featuresDict:
 482                     featuresDict[label] = []
 483                     sense = "+"
 484                 else:
 485                     sense = featuresDict[label][0][-1]
 486
 487                 featuresDict[label].append(("custom", chrom, region.start, region.stop, sense))
 488
 489         for gid in sortList:
 490             featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
 491
 492     featuresByChromDict = {}
 493     for gid in featuresDict:
 494         if restrictGID and gid not in restrictList:
 495             continue
 496
 497         featureList = featuresDict[gid]
 498         newFeatureList = []
 499         isPseudo = False
 500         for (ftype, chrom, start, stop, sense) in featureList:
 501             if ftype == "PSEUDO":
 502                 isPseudo = True
 503
 504             if (start, stop, ftype) not in newFeatureList:
 505                 notContained = True
 506                 containedList = []
 507                 for (fstart, fstop, ftype2) in newFeatureList:
 508                     if start >= fstart and stop <= fstop:
 509                         notContained = False
 510
 511                     if start < fstart and stop > fstop:
 512                         containedList.append((fstart, fstop))
 513
 514                 if len(containedList) > 0:
 515                     newFList = []
 516                     notContained = True
 517                     for (fstart, fstop, ftype2) in newFeatureList:
 518                         if (fstart, fstop) not in containedList:
 519                             newFList.append((fstart, fstop, ftype2))
 520                             if start >= fstart and stop <= fstop:
 521                                 notContained = False
 522
 523                     newFeatureList = newFList
 524                 if notContained:
 525                     newFeatureList.append((start, stop, ftype))
 526
 527         if ignorePseudo and isPseudo:
 528             continue
 529
 530         if chrom not in featuresByChromDict:
 531             featuresByChromDict[chrom] = []
 532
 533         for (start, stop, ftype) in newFeatureList:
 534             featuresByChromDict[chrom].append((start, stop, gid, sense, ftype))
 535
 536     for chrom in featuresByChromDict:
 537         featuresByChromDict[chrom].sort()
 538
 539     if regionComplement:
 540         complementByChromDict = {}
 541         complementIndex = 0
 542         for chrom in featuresByChromDict:
 543             complementByChromDict[chrom] = []
 544             listLength = len(featuresByChromDict[chrom])
 545             if listLength > 0:
 546                 currentStart = 0
 547                 for index in range(listLength):
 548                     currentStop = featuresByChromDict[chrom][index][0]
 549                     complementIndex += 1
 550                     if currentStart < currentStop:
 551                         complementByChromDict[chrom].append((currentStart, currentStop, "nonExon%d" % complementIndex, "F", "nonExon"))
 552
 553                     currentStart = featuresByChromDict[chrom][index][1]
 554
 555                 currentStop = maxStop
 556                 complementByChromDict[chrom].append((currentStart, currentStop, "nonExon%d" % complementIndex, "F", "nonExon"))
 557
 558         return (featuresByChromDict, complementByChromDict)
 559     else:
 560         return featuresByChromDict
 561
 562
 563 def getLocusByChromDict(genome, upstream=0, downstream=0, useCDS=True,
 564                         additionalRegionsDict={}, ignorePseudo=False, upstreamSpanTSS=False,
 565                         lengthCDS=0, keepSense=False, adjustToNeighbor=True):
 566     """ return a dictionary of gene loci. Can be used to retrieve additional
 567     sequence upstream or downstream of gene, up to the next gene. Requires
 568     cistematic, obviously.
 569     Can filter-out pseudogenes and use additional regions outside of existing
 570     gene models. Use upstreamSpanTSS to overlap half of the upstream region
 571     over the TSS.
 572     If lengthCDS > 0 bp, e.g. X, return only the starting X bp from CDS. If
 573     lengthCDS < 0bp, return only the last X bp from CDS.
 574     """
 575     locusByChromDict = {}
 576     if upstream == 0 and downstream == 0 and not useCDS:
 577         print "getLocusByChromDict: asked for no sequence - returning empty dict"
 578         return locusByChromDict
 579     elif upstream > 0 and downstream > 0 and not useCDS:
 580         print "getLocusByChromDict: asked for only upstream and downstream - returning empty dict"
 581         return locusByChromDict
 582     elif lengthCDS != 0 and not useCDS:
 583         print "getLocusByChromDict: asked for partial CDS but not useCDS - returning empty dict"
 584         return locusByChromDict
 585     elif upstreamSpanTSS and lengthCDS != 0:
 586         print "getLocusByChromDict: asked for TSS spanning and partial CDS - returning empty dict"
 587         return locusByChromDict
 588     elif lengthCDS > 0 and downstream > 0:
 589         print "getLocusByChromDict: asked for discontinuous partial CDS from start and downstream - returning empty dict"
 590         return locusByChromDict
 591     elif lengthCDS < 0 and upstream > 0:
 592         print "getLocusByChromDict: asked for discontinuous partial CDS from stop and upstream - returning empty dict"
 593         return locusByChromDict
 594
 595     genomeName = genome.genome
 596     featuresDict = getGeneFeatures(genome, additionalRegionsDict)
 597     for gid in featuresDict:
 598         featureList = featuresDict[gid]
 599         newFeatureList = []
 600         for (ftype, chrom, start, stop, sense) in featureList:
 601             newFeatureList.append((start, stop))
 602
 603         if ignorePseudo and ftype == "PSEUDO":
 604             continue
 605
 606         newFeatureList.sort()
 607
 608         sense = featureList[0][-1]
 609         gstart = newFeatureList[0][0]
 610         gstop = newFeatureList[-1][1]
 611         glen = abs(gstart - gstop)
 612         if sense == "F":
 613             if not useCDS and upstream > 0:
 614                 if upstreamSpanTSS:
 615                     if gstop > (gstart + upstream / 2):
 616                         gstop = gstart + upstream / 2
 617                 else:
 618                     gstop = gstart
 619             elif not useCDS and downstream > 0:
 620                 gstart = gstop
 621
 622             if upstream > 0:
 623                 if upstreamSpanTSS:
 624                     distance = upstream / 2
 625                 else:
 626                     distance = upstream
 627
 628                 if adjustToNeighbor:
 629                     nextGene = genome.leftGeneDistance((genomeName, gid), distance * 2)
 630                     if nextGene < distance * 2:
 631                         distance = nextGene / 2
 632
 633                 distance = max(distance, 1)
 634                 gstart -= distance
 635
 636             if downstream > 0:
 637                 distance = downstream
 638                 if adjustToNeighbor:
 639                     nextGene = genome.rightGeneDistance((genomeName, gid), downstream * 2)
 640                     if nextGene < downstream * 2:
 641                         distance = nextGene / 2
 642
 643                 distance = max(distance, 1)
 644                 gstop += distance
 645
 646             if 0 < lengthCDS < glen:
 647                 gstop = newFeatureList[0][0] + lengthCDS
 648
 649             if lengthCDS < 0 and abs(lengthCDS) < glen:
 650                 gstart = newFeatureList[-1][1] + lengthCDS
 651         else:
 652             if not useCDS and upstream > 0:
 653                 if upstreamSpanTSS:
 654                     if gstart < (gstop - upstream / 2):
 655                         gstart = gstop - upstream / 2
 656                 else:
 657                     gstart = gstop
 658             elif not useCDS and downstream > 0:
 659                     gstop = gstart
 660
 661             if upstream > 0:
 662                 if upstreamSpanTSS:
 663                     distance = upstream /2
 664                 else:
 665                     distance = upstream
 666
 667                 if adjustToNeighbor:
 668                     nextGene = genome.rightGeneDistance((genomeName, gid), distance * 2)
 669                     if nextGene < distance * 2:
 670                         distance = nextGene / 2
 671
 672                 distance = max(distance, 1)
 673                 gstop += distance
 674
 675             if downstream > 0:
 676                 distance = downstream
 677                 if adjustToNeighbor:
 678                     nextGene = genome.leftGeneDistance((genomeName, gid), downstream * 2)
 679                     if nextGene < downstream * 2:
 680                         distance = nextGene / 2
 681
 682                 distance = max(distance, 1)
 683                 gstart -= distance
 684
 685             if 0 < lengthCDS < glen:
 686                 gstart = newFeatureList[-1][-1] - lengthCDS
 687
 688             if lengthCDS < 0 and abs(lengthCDS) < glen:
 689                 gstop = newFeatureList[0][0] - lengthCDS
 690
 691         if chrom not in locusByChromDict:
 692             locusByChromDict[chrom] = []
 693
 694         if keepSense:
 695             locusByChromDict[chrom].append((gstart, gstop, gid, glen, sense))
 696         else:
 697             locusByChromDict[chrom].append((gstart, gstop, gid, glen))
 698
 699     for chrom in locusByChromDict:
 700         locusByChromDict[chrom].sort()
 701
 702     return locusByChromDict
 703
 704
 705 def getGeneFeatures(genome, additionalRegionsDict):
 706     featuresDict = genome.getallGeneFeatures()
 707     if len(additionalRegionsDict) > 0:
 708         sortList = []
 709         for chrom in additionalRegionsDict:
 710             for region in additionalRegionsDict[chrom]:
 711                 label = region.label
 712                 if label not in sortList:
 713                     sortList.append(label)
 714
 715                 if label not in featuresDict:
 716                     featuresDict[label] = []
 717                     sense = "+"
 718                 else:
 719                     sense = featuresDict[label][0][-1]
 720
 721                 featuresDict[label].append(("custom", chrom, region.start, region.stop, sense))
 722
 723         for gid in sortList:
 724             featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
 725
 726     return featuresDict
 727
 728
 729 def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[],
 730                       normalizedTag=1., defaultRegionFormat=True, fixedFirstBin=-1,
 731                       binLength=-1):
 732     """ returns 2 dictionaries of bin counts and region lengths, given a dictionary of predefined regions,
 733         a dictionary of reads, a number of bins, the length of reads, and optionally a list of regions
 734         or a different weight / tag.
 735     """
 736     index = 0
 737     regionsBins = {}
 738     regionsLen = {}
 739
 740     if defaultRegionFormat:
 741         regionIDField = 0
 742         startField = 1
 743         stopField = 2
 744         lengthField = 3
 745     else:
 746         startField = 0
 747         stopField = 1
 748         regionIDField = 2
 749         lengthField = 3
 750
 751     senseField = 4
 752
 753     print "entering computeRegionBins"
 754     if len(regionList) > 0:
 755         for readID in regionList:
 756             regionsBins[readID] = [0.] * bins
 757     else:
 758         for chrom in regionsByChromDict:
 759             for regionTuple in regionsByChromDict[chrom]:
 760                 regionID = regionTuple[regionIDField]
 761                 regionsBins[regionID] = [0.] * bins
 762
 763     for chrom in hitDict:
 764         if chrom not in regionsByChromDict:
 765             continue
 766
 767         for regionTuple in regionsByChromDict[chrom]:
 768             regionID = regionTuple[regionIDField]
 769             regionsLen[regionID] = regionTuple[lengthField]
 770
 771         print "%s\n" % chrom
 772         startRegion = 0
 773         for read in hitDict[chrom]:
 774             tagStart = read["start"]
 775             weight = read["weight"]
 776             index += 1
 777             if index % 100000 == 0:
 778                 print "read %d " % index,
 779
 780             stopPoint = tagStart + readlen
 781             if startRegion < 0:
 782                 startRegion = 0
 783
 784             for regionTuple in regionsByChromDict[chrom][startRegion:]:
 785                 start = regionTuple[startField]
 786                 stop = regionTuple[stopField]
 787                 regionID = regionTuple[regionIDField]
 788                 rlen = regionTuple[lengthField]
 789                 try:
 790                     rsense = regionTuple[senseField]
 791                 except IndexError:
 792                     rsense = "F"
 793
 794                 if tagStart > stop:
 795                     startRegion += 1
 796                     continue
 797
 798                 if start > stopPoint:
 799                     startRegion -= 10
 800                     break
 801
 802                 if start <= tagStart <= stop:
 803                     if binLength < 1:
 804                         regionBinLength = rlen / bins
 805                     else:
 806                         regionBinLength = binLength
 807
 808                     startdist = tagStart - start
 809                     if rsense == "F":
 810                         # we are relying on python's integer division quirk
 811                         binID = startdist / regionBinLength
 812                         if (fixedFirstBin > 0) and (startdist < fixedFirstBin):
 813                             binID = 0
 814                         elif fixedFirstBin > 0:
 815                             binID = 1
 816
 817                         if binID >= bins:
 818                             binID = bins - 1
 819
 820                         try:
 821                             regionsBins[regionID][binID] += normalizedTag * weight
 822                         except KeyError:
 823                             print "%s %s" % (regionID, str(binID))
 824                     else:
 825                         rdist = rlen - startdist
 826                         binID = rdist / regionBinLength
 827                         if (fixedFirstBin > 0) and (rdist < fixedFirstBin):
 828                             binID = 0
 829                         elif fixedFirstBin > 0:
 830                             binID = 1
 831
 832                         if binID >= bins:
 833                             binID = bins - 1
 834
 835                         try:
 836                             regionsBins[regionID][binID] += normalizedTag * weight
 837                         except KeyError:
 838                             print "%s %s" % (regionID, str(binID))
 839
 840                     stopPoint = stop
 841
 842     return (regionsBins, regionsLen)