commoncode.py

   1 #
   2 #  commoncode.py
   3 #  ENRAGE
   4 #
   5
   6 import ConfigParser
   7 import os
   8 import string
   9 from time import strftime
  10 from array import array
  11 from collections import defaultdict
  12 import Peak
  13 from cistematic.core.geneinfo import geneinfoDB
  14 from cistematic.genomes import Genome
  15 import Region
  16
  17 commoncodeVersion = 5.6
  18 currentRDSversion = 2.0
  19
  20
  21 def getReverseComplement(base):
  22     revComp = {"A": "T",
  23                "T": "A",
  24                "G": "C",
  25                "C": "G",
  26                "N": "N"
  27         }
  28
  29     return revComp[base]
  30
  31
  32 def countDuplicatesInList(listToCheck):
  33     tally = defaultdict(int)
  34     for item in listToCheck:
  35         tally[item] += 1
  36
  37     return tally.items()
  38
  39
  40 def writeLog(logFile, messenger, message):
  41     """ create a log file to write a message from a messenger or append to an existing file.
  42     """
  43     try:
  44         logfile = open(logFile)
  45     except IOError:
  46         logfile = open(logFile, "w")
  47     else:
  48         logfile = open(logFile, "a")
  49
  50     logfile.writelines("%s: [%s] %s\n" % (strftime("%Y-%m-%d %H:%M:%S"), messenger, message))
  51     logfile.close()
  52
  53
  54 def getGeneInfoDict(genome, cache=False):
  55     idb = geneinfoDB(cache=cache)
  56     if genome == "dmelanogaster":
  57         geneinfoDict = idb.getallGeneInfo(genome, infoKey="locus")
  58     else:
  59         geneinfoDict = idb.getallGeneInfo(genome)
  60
  61     return geneinfoDict
  62
  63
  64 def getGeneAnnotDict(genome, inRAM=False):
  65     return getExtendedGeneAnnotDict(genome, "", inRAM=inRAM)
  66
  67
  68 def getExtendedGeneAnnotDict(genome, extendGenome, replaceModels=False, inRAM=False):
  69     hg = Genome(genome, inRAM=inRAM)
  70     if extendGenome != "":
  71         hg.extendFeatures(extendGenome, replace=replaceModels)
  72
  73     geneannotDict = hg.allAnnotInfo()
  74
  75     return geneannotDict
  76
  77
  78 def getConfigParser(fileList=[]):
  79     configFiles = ["erange.config", os.path.expanduser("~/.erange.config")]
  80     for filename in fileList:
  81         configFiles.append(filename)
  82
  83     config = ConfigParser.SafeConfigParser()
  84     config.read(configFiles)
  85
  86     return config
  87
  88
  89 def getConfigOption(parser, section, option, default=None):
  90     try:
  91         setting = parser.get(section, option)
  92     except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
  93         setting = default
  94
  95     return setting
  96
  97
  98 def getConfigIntOption(parser, section, option, default=None):
  99     try:
 100         setting = parser.getint(section, option)
 101     except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
 102         setting = default
 103
 104     return setting
 105
 106
 107 def getConfigFloatOption(parser, section, option, default=None):
 108     try:
 109         setting = parser.getfloat(section, option)
 110     except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
 111         setting = default
 112
 113     return setting
 114
 115
 116 def getConfigBoolOption(parser, section, option, default=None):
 117     try:
 118         setting = parser.getboolean(section, option)
 119     except (ConfigParser.NoSectionError, ConfigParser.NoOptionError, ValueError):
 120         setting = default
 121
 122     return setting
 123
 124
 125 def getAllConfigSectionOptions(parser, section):
 126     try:
 127         setting = parser.items(section)
 128     except ConfigParser.NoSectionError:
 129         setting = []
 130
 131     return setting
 132
 133
 134 def getMergedRegions(regionfilename, maxDist=1000, minHits=0, verbose=False, keepLabel=False,
 135                      fullChrom=False, chromField=1, scoreField=4, pad=0, compact=False,
 136                      doMerge=True, keepPeak=False, returnTop=0):
 137
 138     """ returns a dictionary containing a list of merged overlapping regions by chromosome;
 139     can optionally filter regions that have a scoreField fewer than minHits.
 140     Can also optionally return the label of each region, as well as the
 141     peak, if supplied (peakPos and peakHeight should be the last 2 fields).
 142     Can return the top regions based on score if higher than minHits.
 143     """
 144     infile = open(regionfilename)
 145     lines = infile.readlines()
 146     regions = getMergedRegionsFromList(lines, maxDist, minHits, verbose, keepLabel,
 147                                        fullChrom, chromField, scoreField, pad, compact,
 148                                        doMerge, keepPeak, returnTop)
 149
 150     infile.close()
 151
 152     return regions
 153
 154
 155 def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False, keepLabel=False,
 156                      fullChrom = False, chromField=1, scoreField=4, pad=0, compact=False,
 157                      doMerge=True, keepPeak=False, returnTop=0):
 158     """ returns a dictionary containing a list of merged overlapping regions by chromosome;
 159     can optionally filter regions that have a scoreField fewer than minHits.
 160     Can also optionally return the label of each region, as well as the
 161     peak, if supplied (peakPos and peakHeight should be the last 2 fields).
 162     Can return the top regions based on score if higher than minHits.
 163     """
 164     regions = {}
 165     hasPvalue = 0
 166     hasShift = 0
 167     if 0 < returnTop < len(regionList):
 168         scores = []
 169         for regionEntry in regionList:
 170             if regionEntry[0] == "#":
 171                 if "pvalue" in regionEntry:
 172                     hasPvalue = 1
 173
 174                 if "readShift" in regionEntry:
 175                     hasShift = 1
 176
 177                 continue
 178
 179             fields = regionEntry.strip().split("\t")
 180             hits = float(fields[scoreField].strip())
 181             scores.append(hits)
 182
 183         scores.sort()
 184         returnTop = -1 * returnTop
 185         minScore = scores[returnTop]
 186         if minScore > minHits:
 187             minHits = minScore
 188
 189     mergeCount = 0
 190     chromField = int(chromField)
 191     count = 0
 192     #TODO: Current algorithm processes input file line by line and compares with prior lines.  Problem is it
 193     #      exits at the first merge.  This is not a problem when the input is sorted by start position, but in
 194     #      the case of 3 regions ABC that are in the input file as ACB as it goes now when processing C there
 195     #      will be no merge with A as B is needed to bridge the two.  When it comes time to process B it will
 196     #      be merged with A but that will exit the loop and the merge with C will be missed.
 197     for regionEntry in regionList:
 198         if regionEntry[0] == "#":
 199             if "pvalue" in regionEntry:
 200                 hasPvalue = 1
 201
 202             if "readShift" in regionEntry:
 203                 hasShift = 1
 204
 205             continue
 206
 207         fields = regionEntry.strip().split("\t")
 208         if minHits >= 0:
 209             try:
 210                 hits = float(fields[scoreField].strip())
 211             except (IndexError, ValueError):
 212                 continue
 213
 214             if hits < minHits:
 215                 continue
 216
 217         if compact:
 218             (chrom, pos) = fields[chromField].split(":")
 219             (front, back) = pos.split("-")
 220             start = int(front)
 221             stop = int(back)
 222         elif chromField > 1:
 223             label = string.join(fields[:chromField],"\t")
 224             chrom = fields[chromField]
 225             start = int(fields[chromField + 1]) - pad
 226             stop = int(fields[chromField + 2]) + pad
 227         else:
 228             label = fields[0]
 229             chrom = fields[1]
 230             start = int(fields[2]) - pad
 231             stop = int(fields[3]) + pad
 232
 233         if not fullChrom:
 234             chrom = chrom[3:]
 235
 236         if keepPeak:
 237             peakPos = int(fields[-2 - hasPvalue - hasShift])
 238             peakHeight = float(fields[-1 - hasPvalue - hasShift])
 239
 240         if chrom not in regions:
 241             regions[chrom] = []
 242
 243         merged = False
 244
 245         if doMerge and len(regions[chrom]) > 0:
 246             for index in range(len(regions[chrom])):
 247                 region = regions[chrom][index]
 248                 rstart = region.start
 249                 rstop = region.stop
 250                 if regionsOverlap(start, stop, rstart, rstop) or regionsAreWithinDistance(start, stop, rstart, rstop, maxDist):
 251                     if start < rstart:
 252                         rstart = start
 253
 254                     if rstop < stop:
 255                         rstop = stop
 256
 257                     if keepPeak:
 258                         rpeakPos = region.peakPos
 259                         rpeakHeight = region.peakHeight
 260                         if peakHeight > rpeakHeight:
 261                             rpeakHeight = peakHeight
 262                             rpeakPos = peakPos
 263
 264                     regions[chrom][index].start = rstart
 265                     regions[chrom][index].stop = rstop
 266                     regions[chrom][index].length = abs(rstop - rstart)
 267                     if keepLabel:
 268                         regions[chrom][index].label = label
 269
 270                     if keepPeak:
 271                         regions[chrom][index].peakPos = rpeakPos
 272                         regions[chrom][index].peakHeight = rpeakHeight
 273
 274
 275                     mergeCount += 1
 276                     merged = True
 277                     break
 278
 279         if not merged:
 280             region = Region.Region(start, stop)
 281             if keepLabel:
 282                 region.label = label
 283
 284             if keepPeak:
 285                 region.peakPos = peakPos
 286                 region.peakHeight = peakHeight
 287
 288             regions[chrom].append(region)
 289             count += 1
 290
 291         if verbose and (count % 100000 == 0):
 292             print count
 293
 294     regionCount = 0
 295     for chrom in regions:
 296         regionCount += len(regions[chrom])
 297         regions[chrom].sort(cmp=lambda x,y:cmp(x.start, y.start))
 298
 299     if verbose:
 300         print "merged %d times" % mergeCount
 301         print "returning %d regions" % regionCount
 302
 303     return regions
 304
 305
 306 def regionsOverlap(start, stop, rstart, rstop):
 307     if start > stop:
 308         (start, stop) = (stop, start)
 309
 310     if rstart > rstop:
 311         (rstart, rstop) = (rstop, rstart)
 312
 313     return (rstart <= start <= rstop) or (rstart <= stop <= rstop) or (start <= rstart <= stop) or (start <= rstop <= stop)
 314
 315
 316 def regionsAreWithinDistance(start, stop, rstart, rstop, maxDist):
 317     if start > stop:
 318         (start, stop) = (stop, start)
 319
 320     if rstart > rstop:
 321         (rstart, rstop) = (rstop, rstart)
 322
 323     return (abs(rstart-stop) <= maxDist) or (abs(rstop-start) <= maxDist)
 324
 325
 326 def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False,
 327              shift=0, maxshift=75):
 328     """ find the peak in a list of reads (hitlist) in a region
 329     of a given length and absolute start point. returns a
 330     list of peaks, the number of hits, a triangular-smoothed
 331     version of hitlist, and the number of reads that are
 332     forward (plus) sense.
 333     If doWeight is True, weight the reads accordingly.
 334     If leftPlus is True, return the number of plus reads left of
 335     the peak, taken to be the first TopPos position.
 336     """
 337
 338     if shift == "auto":
 339         shift = getBestShiftForRegion(hitList, start, length, doWeight, maxshift)
 340
 341     seqArray, regionArray, numHits, numPlus = findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, leftPlus)
 342
 343     # implementing a triangular smooth
 344     smoothArray = array("f", [0.] * length)
 345     for pos in range(2,length -2):
 346         smoothArray[pos] = (seqArray[pos -2] + 2 * seqArray[pos - 1] + 3 * seqArray[pos] + 2 * seqArray[pos + 1] + seqArray[pos + 2]) / 9.0
 347
 348     topPos = getPeakPositionList(smoothArray, length)
 349     peak = Peak(topPos, numHits, smoothArray, numPlus, shift=shift)
 350
 351     if leftPlus:
 352         numLeftPlus = 0
 353         maxPos = topPos[0]
 354         for read in regionArray:
 355             if doWeight:
 356                 weight = read["weight"]
 357             else:
 358                 weight = 1.0
 359
 360             currentPos = read["start"] - start
 361             if currentPos <= maxPos and read["sense"] == "+":
 362                 numLeftPlus += weight
 363
 364         peak.numLeftPlus = numLeftPlus
 365
 366     return peak
 367
 368
 369 def getBestShiftForRegion(hitList, start, length, doWeight=False, maxShift=75):
 370     bestShift = 0
 371     lowestScore = 20000000000
 372     for testShift in xrange(maxShift + 1):
 373         shiftArray = array("f", [0.] * length)
 374         for read in hitList:
 375             currentpos = read["start"] - start
 376             if read["sense"] == "+":
 377                 currentpos += testShift
 378             else:
 379                 currentpos -= testShift
 380
 381             if (currentpos < 1) or (currentpos >= length):
 382                 continue
 383
 384             if doWeight:
 385                 weight = read["weight"]
 386             else:
 387                 weight = 1.0
 388
 389             if read["sense"] == "+":
 390                 shiftArray[currentpos] += weight
 391             else:
 392                 shiftArray[currentpos] -= weight
 393
 394         currentScore = 0
 395         for score in shiftArray:
 396             currentScore += abs(score)
 397
 398         print currentScore
 399         if currentScore < lowestScore:
 400             bestShift = testShift
 401             lowestScore = currentScore
 402
 403     return bestShift
 404
 405
 406 def findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, leftPlus):
 407     seqArray = array("f", [0.] * length)
 408     numHits = 0.
 409     numPlus = 0.
 410     regionArray = []
 411     for read in hitList:
 412         currentpos = read["start"] - start
 413         if read["sense"] == "+":
 414             currentpos += shift
 415         else:
 416             currentpos -= shift
 417
 418         if (currentpos <  1 - readlen) or (currentpos >= length):
 419             continue
 420
 421         if doWeight:
 422             weight = read["weight"]
 423         else:
 424             weight = 1.0
 425
 426         numHits += weight
 427         if leftPlus:
 428             regionArray.append(read)
 429
 430         hitIndex = 0
 431         while currentpos < 0:
 432             hitIndex += 1
 433             currentpos += 1
 434
 435         while hitIndex < readlen and  currentpos < length:
 436             seqArray[currentpos] += weight
 437             hitIndex += 1
 438             currentpos += 1
 439
 440         if read["sense"] == "+":
 441             numPlus += weight
 442
 443     return seqArray, regionArray, numHits, numPlus
 444
 445
 446 def getPeakPositionList(smoothArray, length):
 447     topNucleotide = 0
 448     peakList = []
 449     for currentpos in xrange(length):
 450         if topNucleotide < smoothArray[currentpos]:
 451             topNucleotide = smoothArray[currentpos]
 452             peakList = [currentpos]
 453         elif topNucleotide  == smoothArray[currentpos]:
 454             peakList.append(currentpos)
 455
 456     return peakList
 457
 458
 459 def getFeaturesByChromDict(genomeObject, additionalRegionsDict={}, ignorePseudo=False,
 460                            restrictList=[], regionComplement=False, maxStop=250000000):
 461     """ return a dictionary of cistematic gene features. Requires
 462     cistematic, obviously. Can filter-out pseudogenes. Will use
 463     additional regions dict to supplement gene models, if available.
 464     Can restrict output to a list of GIDs.
 465     If regionComplement is set to true, returns the regions *outside* of the
 466     calculated boundaries, which is useful for retrieving intronic and
 467     intergenic regions. maxStop is simply used to define the uppermost
 468     boundary of the complement region.
 469     """
 470     featuresDict = genomeObject.getallGeneFeatures()
 471     restrictGID = False
 472     if len(restrictList) > 0:
 473         restrictGID = True
 474
 475     if len(additionalRegionsDict) > 0:
 476         sortList = []
 477         for chrom in additionalRegionsDict:
 478             for region in additionalRegionsDict[chrom]:
 479                 label = region.label
 480                 if label not in sortList:
 481                     sortList.append(label)
 482
 483                 if label not in featuresDict:
 484                     featuresDict[label] = []
 485                     sense = "+"
 486                 else:
 487                     sense = featuresDict[label][0][-1]
 488
 489                 featuresDict[label].append(("custom", chrom, region.start, region.stop, sense))
 490
 491         for gid in sortList:
 492             featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
 493
 494     featuresByChromDict = {}
 495     for gid in featuresDict:
 496         if restrictGID and gid not in restrictList:
 497             continue
 498
 499         featureList = featuresDict[gid]
 500         newFeatureList = []
 501         isPseudo = False
 502         for (ftype, chrom, start, stop, sense) in featureList:
 503             if ftype == "PSEUDO":
 504                 isPseudo = True
 505
 506             if (start, stop, ftype) not in newFeatureList:
 507                 notContained = True
 508                 containedList = []
 509                 for (fstart, fstop, ftype2) in newFeatureList:
 510                     if start >= fstart and stop <= fstop:
 511                         notContained = False
 512
 513                     if start < fstart and stop > fstop:
 514                         containedList.append((fstart, fstop))
 515
 516                 if len(containedList) > 0:
 517                     newFList = []
 518                     notContained = True
 519                     for (fstart, fstop, ftype2) in newFeatureList:
 520                         if (fstart, fstop) not in containedList:
 521                             newFList.append((fstart, fstop, ftype2))
 522                             if start >= fstart and stop <= fstop:
 523                                 notContained = False
 524
 525                     newFeatureList = newFList
 526                 if notContained:
 527                     newFeatureList.append((start, stop, ftype))
 528
 529         if ignorePseudo and isPseudo:
 530             continue
 531
 532         if chrom not in featuresByChromDict:
 533             featuresByChromDict[chrom] = []
 534
 535         for (start, stop, ftype) in newFeatureList:
 536             featuresByChromDict[chrom].append((start, stop, gid, sense, ftype))
 537
 538     for chrom in featuresByChromDict:
 539         featuresByChromDict[chrom].sort()
 540
 541     if regionComplement:
 542         complementByChromDict = {}
 543         complementIndex = 0
 544         for chrom in featuresByChromDict:
 545             complementByChromDict[chrom] = []
 546             listLength = len(featuresByChromDict[chrom])
 547             if listLength > 0:
 548                 currentStart = 0
 549                 for index in range(listLength):
 550                     currentStop = featuresByChromDict[chrom][index][0]
 551                     complementIndex += 1
 552                     if currentStart < currentStop:
 553                         complementByChromDict[chrom].append((currentStart, currentStop, "nonExon%d" % complementIndex, "F", "nonExon"))
 554
 555                     currentStart = featuresByChromDict[chrom][index][1]
 556
 557                 currentStop = maxStop
 558                 complementByChromDict[chrom].append((currentStart, currentStop, "nonExon%d" % complementIndex, "F", "nonExon"))
 559
 560         return (featuresByChromDict, complementByChromDict)
 561     else:
 562         return featuresByChromDict
 563
 564
 565 def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True,
 566                         additionalRegionsDict={}, ignorePseudo=False, upstreamSpanTSS=False,
 567                         lengthCDS=0, keepSense=False, adjustToNeighbor=True):
 568     """ return a dictionary of gene loci. Can be used to retrieve additional
 569     sequence upstream or downstream of gene, up to the next gene. Requires
 570     cistematic, obviously.
 571     Can filter-out pseudogenes and use additional regions outside of existing
 572     gene models. Use upstreamSpanTSS to overlap half of the upstream region
 573     over the TSS.
 574     If lengthCDS > 0 bp, e.g. X, return only the starting X bp from CDS. If
 575     lengthCDS < 0bp, return only the last X bp from CDS.
 576     """
 577     locusByChromDict = {}
 578     if upstream == 0 and downstream == 0 and not useCDS:
 579         print "getLocusByChromDict: asked for no sequence - returning empty dict"
 580         return locusByChromDict
 581     elif upstream > 0 and downstream > 0 and not useCDS:
 582         print "getLocusByChromDict: asked for only upstream and downstream - returning empty dict"
 583         return locusByChromDict
 584     elif lengthCDS != 0 and not useCDS:
 585         print "getLocusByChromDict: asked for partial CDS but not useCDS - returning empty dict"
 586         return locusByChromDict
 587     elif upstreamSpanTSS and lengthCDS != 0:
 588         print "getLocusByChromDict: asked for TSS spanning and partial CDS - returning empty dict"
 589         return locusByChromDict
 590     elif lengthCDS > 0 and downstream > 0:
 591         print "getLocusByChromDict: asked for discontinuous partial CDS from start and downstream - returning empty dict"
 592         return locusByChromDict
 593     elif lengthCDS < 0 and upstream > 0:
 594         print "getLocusByChromDict: asked for discontinuous partial CDS from stop and upstream - returning empty dict"
 595         return locusByChromDict
 596
 597     genome = genomeObject.genome
 598     featuresDict = genomeObject.getallGeneFeatures()
 599     if len(additionalRegionsDict) > 0:
 600         sortList = []
 601         for chrom in additionalRegionsDict:
 602             for region in additionalRegionsDict[chrom]:
 603                 label = region.label
 604                 if label not in sortList:
 605                     sortList.append(label)
 606
 607                 if label not in featuresDict:
 608                     featuresDict[label] = []
 609                     sense = "+"
 610                 else:
 611                     sense = featuresDict[label][0][-1]
 612
 613                 featuresDict[label].append(("custom", chrom, region.start, region.stop, sense))
 614
 615         for gid in sortList:
 616             featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
 617
 618     for gid in featuresDict:
 619         featureList = featuresDict[gid]
 620         newFeatureList = []
 621         for (ftype, chrom, start, stop, sense) in featureList:
 622             newFeatureList.append((start, stop))
 623
 624         if ignorePseudo and ftype == "PSEUDO":
 625             continue
 626
 627         newFeatureList.sort()
 628
 629         sense = featureList[0][-1]
 630         gstart = newFeatureList[0][0]
 631         gstop = newFeatureList[-1][1]
 632         glen = abs(gstart - gstop)
 633         if sense == "F":
 634             if not useCDS and upstream > 0:
 635                 if upstreamSpanTSS:
 636                     if gstop > (gstart + upstream / 2):
 637                         gstop = gstart + upstream / 2
 638                 else:
 639                     gstop = gstart
 640             elif not useCDS and downstream > 0:
 641                 gstart = gstop
 642
 643             if upstream > 0:
 644                 if upstreamSpanTSS:
 645                     distance = upstream / 2
 646                 else:
 647                     distance = upstream
 648
 649                 if adjustToNeighbor:
 650                     nextGene = genomeObject.leftGeneDistance((genome, gid), distance * 2)
 651                     if nextGene < distance * 2:
 652                         distance = nextGene / 2
 653
 654                 if distance < 1:
 655                     distance = 1
 656
 657                 gstart -= distance
 658
 659             if downstream > 0:
 660                 distance = downstream
 661                 if adjustToNeighbor:
 662                     nextGene = genomeObject.rightGeneDistance((genome, gid), downstream * 2)
 663                     if nextGene < downstream * 2:
 664                         distance = nextGene / 2
 665
 666                 if distance < 1:
 667                     distance = 1
 668
 669                 gstop += distance
 670
 671             if lengthCDS > 0:
 672                 if lengthCDS < glen:
 673                     gstop = newFeatureList[0][0] + lengthCDS
 674
 675             if lengthCDS < 0:
 676                 if abs(lengthCDS) < glen:
 677                     gstart = newFeatureList[-1][1] + lengthCDS
 678         else:
 679             if not useCDS and upstream > 0:
 680                 if upstreamSpanTSS:
 681                     if gstart < (gstop - upstream / 2):
 682                         gstart = gstop - upstream / 2
 683                 else:
 684                     gstart = gstop
 685             elif not useCDS and downstream > 0:
 686                     gstop = gstart
 687
 688             if upstream > 0:
 689                 if upstreamSpanTSS:
 690                     distance = upstream /2
 691                 else:
 692                     distance = upstream
 693
 694                 if adjustToNeighbor:
 695                     nextGene = genomeObject.rightGeneDistance((genome, gid), distance * 2)
 696                     if nextGene < distance * 2:
 697                         distance = nextGene / 2
 698
 699                 if distance < 1:
 700                     distance = 1
 701
 702                 gstop += distance
 703
 704             if downstream > 0:
 705                 distance = downstream
 706                 if adjustToNeighbor:
 707                     nextGene = genomeObject.leftGeneDistance((genome, gid), downstream * 2)
 708                     if nextGene < downstream * 2:
 709                         distance = nextGene / 2
 710
 711                 if distance < 1:
 712                     distance = 1
 713
 714                 gstart -= distance
 715
 716             if lengthCDS > 0:
 717                 if lengthCDS < glen:
 718                     gstart = newFeatureList[-1][-1] - lengthCDS
 719
 720             if lengthCDS < 0:
 721                 if abs(lengthCDS) < glen:
 722                     gstop = newFeatureList[0][0] - lengthCDS
 723
 724         glen = abs(gstop - gstart)
 725         if chrom not in locusByChromDict:
 726             locusByChromDict[chrom] = []
 727
 728         if keepSense:
 729             locusByChromDict[chrom].append((gstart, gstop, gid, glen, sense))
 730         else:
 731             locusByChromDict[chrom].append((gstart, gstop, gid, glen))
 732
 733     for chrom in locusByChromDict:
 734         locusByChromDict[chrom].sort()
 735
 736     return locusByChromDict
 737
 738
 739 def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[],
 740                       normalizedTag=1., defaultRegionFormat=True, fixedFirstBin=-1,
 741                       binLength=-1):
 742     """ returns 2 dictionaries of bin counts and region lengths, given a dictionary of predefined regions,
 743         a dictionary of reads, a number of bins, the length of reads, and optionally a list of regions
 744         or a different weight / tag.
 745     """
 746     index = 0
 747     regionsBins = {}
 748     regionsLen = {}
 749
 750     if defaultRegionFormat:
 751         regionIDField = 0
 752         startField = 1
 753         stopField = 2
 754         lengthField = 3
 755     else:
 756         startField = 0
 757         stopField = 1
 758         regionIDField = 2
 759         lengthField = 3
 760
 761     senseField = 4
 762
 763     print "entering computeRegionBins"
 764     if len(regionList) > 0:
 765         for readID in regionList:
 766             regionsBins[readID] = [0.] * bins
 767     else:
 768         for chrom in regionsByChromDict:
 769             for regionTuple in regionsByChromDict[chrom]:
 770                 regionID = regionTuple[regionIDField]
 771                 regionsBins[regionID] = [0.] * bins
 772
 773     for chrom in hitDict:
 774         if chrom not in regionsByChromDict:
 775             continue
 776
 777         for regionTuple in regionsByChromDict[chrom]:
 778             regionID = regionTuple[regionIDField]
 779             regionsLen[regionID] = regionTuple[lengthField]
 780
 781         print "%s\n" % chrom
 782         startRegion = 0
 783         for read in hitDict[chrom]:
 784             tagStart = read["start"]
 785             weight = read["weight"]
 786             index += 1
 787             if index % 100000 == 0:
 788                 print "read %d " % index,
 789
 790             stopPoint = tagStart + readlen
 791             if startRegion < 0:
 792                 startRegion = 0
 793
 794             for regionTuple in regionsByChromDict[chrom][startRegion:]:
 795                 start = regionTuple[startField]
 796                 stop = regionTuple[stopField]
 797                 regionID = regionTuple[regionIDField]
 798                 rlen = regionTuple[lengthField]
 799                 try:
 800                     rsense = regionTuple[senseField]
 801                 except:
 802                     rsense = "F"
 803
 804                 if tagStart > stop:
 805                     startRegion += 1
 806                     continue
 807
 808                 if start > stopPoint:
 809                     startRegion -= 10
 810                     break
 811
 812                 if start <= tagStart <= stop:
 813                     if binLength < 1:
 814                         regionBinLength = rlen / bins
 815                     else:
 816                         regionBinLength = binLength
 817
 818                     startdist = tagStart - start
 819                     if rsense == "F":
 820                         # we are relying on python's integer division quirk
 821                         binID = startdist / regionBinLength
 822                         if (fixedFirstBin > 0) and (startdist < fixedFirstBin):
 823                             binID = 0
 824                         elif fixedFirstBin > 0:
 825                             binID = 1
 826
 827                         if binID >= bins:
 828                             binID = bins - 1
 829
 830                         try:
 831                             regionsBins[regionID][binID] += normalizedTag * weight
 832                         except KeyError:
 833                             print "%s %s" % (regionID, str(binID))
 834                     else:
 835                         rdist = rlen - startdist
 836                         binID = rdist / regionBinLength
 837                         if (fixedFirstBin > 0) and (rdist < fixedFirstBin):
 838                             binID = 0
 839                         elif fixedFirstBin > 0:
 840                             binID = 1
 841
 842                         if binID >= bins:
 843                             binID = bins - 1
 844
 845                         try:
 846                             regionsBins[regionID][binID] += normalizedTag * weight
 847                         except KeyError:
 848                             print "%s %s" % (regionID, str(binID))
 849
 850                     stopPoint = stop
 851
 852     return (regionsBins, regionsLen)