commoncode.py

   1 import ConfigParser
   2 import os
   3 import string
   4 from time import strftime
   5 from array import array
   6 from collections import defaultdict
   7 import Peak
   8 from cistematic.core.geneinfo import geneinfoDB
   9 from cistematic.genomes import Genome
  10 import Region
  11
  12 commoncodeVersion = 5.6
  13 currentRDSversion = 2.0
  14
  15 class ErangeError(Exception):
  16     pass
  17
  18
  19 def getReverseComplement(base):
  20     revComp = {"A": "T",
  21                "T": "A",
  22                "G": "C",
  23                "C": "G",
  24                "N": "N"
  25         }
  26
  27     return revComp[base]
  28
  29
  30 def countDuplicatesInList(listToCheck):
  31     tally = defaultdict(int)
  32     for item in listToCheck:
  33         tally[item] += 1
  34
  35     return tally.items()
  36
  37
  38 def writeLog(logFile, messenger, message):
  39     """ create a log file to write a message from a messenger or append to an existing file.
  40     """
  41     try:
  42         logfile = open(logFile)
  43     except IOError:
  44         logfile = open(logFile, "w")
  45     else:
  46         logfile = open(logFile, "a")
  47
  48     logfile.writelines("%s: [%s] %s\n" % (strftime("%Y-%m-%d %H:%M:%S"), messenger, message))
  49     logfile.close()
  50
  51
  52 def getGeneInfoDict(genome, cache=False):
  53     idb = geneinfoDB(cache=cache)
  54     if genome == "dmelanogaster":
  55         geneinfoDict = idb.getallGeneInfo(genome, infoKey="locus")
  56     else:
  57         geneinfoDict = idb.getallGeneInfo(genome)
  58
  59     return geneinfoDict
  60
  61
  62 def getGeneAnnotDict(genome, inRAM=False):
  63     return getExtendedGeneAnnotDict(genome, "", inRAM=inRAM)
  64
  65
  66 def getExtendedGeneAnnotDict(genomeName, extendGenome, replaceModels=False, inRAM=False):
  67     genome = Genome(genomeName, inRAM=inRAM)
  68     if extendGenome != "":
  69         genome.extendFeatures(extendGenome, replace=replaceModels)
  70
  71     geneannotDict = genome.allAnnotInfo()
  72
  73     return geneannotDict
  74
  75
  76 def getConfigParser(fileList=[]):
  77     configFiles = ["erange.config", os.path.expanduser("~/.erange.config")]
  78     for filename in fileList:
  79         configFiles.append(filename)
  80
  81     config = ConfigParser.SafeConfigParser()
  82     config.read(configFiles)
  83
  84     return config
  85
  86
  87 def getConfigOption(parser, section, option, default=None):
  88     try:
  89         setting = parser.get(section, option)
  90     except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
  91         setting = default
  92
  93     return setting
  94
  95
  96 def getConfigIntOption(parser, section, option, default=None):
  97     try:
  98         setting = parser.getint(section, option)
  99     except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
 100         setting = default
 101
 102     return setting
 103
 104
 105 def getConfigFloatOption(parser, section, option, default=None):
 106     try:
 107         setting = parser.getfloat(section, option)
 108     except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
 109         setting = default
 110
 111     return setting
 112
 113
 114 def getConfigBoolOption(parser, section, option, default=None):
 115     try:
 116         setting = parser.getboolean(section, option)
 117     except (ConfigParser.NoSectionError, ConfigParser.NoOptionError, ValueError):
 118         setting = default
 119
 120     return setting
 121
 122
 123 def getAllConfigSectionOptions(parser, section):
 124     try:
 125         setting = parser.items(section)
 126     except ConfigParser.NoSectionError:
 127         setting = []
 128
 129     return setting
 130
 131
 132 def getMergedRegions(regionfilename, maxDist=1000, minHits=0, verbose=False, keepLabel=False,
 133                      fullChrom=False, chromField=1, scoreField=4, pad=0, compact=False,
 134                      doMerge=True, keepPeak=False, returnTop=0):
 135
 136     """ returns a dictionary containing a list of merged overlapping regions by chromosome;
 137     can optionally filter regions that have a scoreField fewer than minHits.
 138     Can also optionally return the label of each region, as well as the
 139     peak, if supplied (peakPos and peakHeight should be the last 2 fields).
 140     Can return the top regions based on score if higher than minHits.
 141     """
 142     infile = open(regionfilename)
 143     lines = infile.readlines()
 144     regions = getMergedRegionsFromList(lines, maxDist, minHits, verbose, keepLabel,
 145                                        fullChrom, chromField, scoreField, pad, compact,
 146                                        doMerge, keepPeak, returnTop)
 147
 148     infile.close()
 149
 150     return regions
 151
 152
 153 def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False, keepLabel=False,
 154                      fullChrom = False, chromField=1, scoreField=4, pad=0, compact=False,
 155                      doMerge=True, keepPeak=False, returnTop=0):
 156     """ returns a dictionary containing a list of merged overlapping regions by chromosome;
 157     can optionally filter regions that have a scoreField fewer than minHits.
 158     Can also optionally return the label of each region, as well as the
 159     peak, if supplied (peakPos and peakHeight should be the last 2 fields).
 160     Can return the top regions based on score if higher than minHits.
 161     """
 162     regions = {}
 163     hasPvalue = 0
 164     hasShift = 0
 165     if 0 < returnTop < len(regionList):
 166         scores = []
 167         for regionEntry in regionList:
 168             if regionEntry[0] == "#":
 169                 if "pvalue" in regionEntry:
 170                     hasPvalue = 1
 171
 172                 if "readShift" in regionEntry:
 173                     hasShift = 1
 174
 175                 continue
 176
 177             fields = regionEntry.strip().split("\t")
 178             hits = float(fields[scoreField].strip())
 179             scores.append(hits)
 180
 181         scores.sort()
 182         returnTop = -1 * returnTop
 183         minScore = scores[returnTop]
 184         if minScore > minHits:
 185             minHits = minScore
 186
 187     mergeCount = 0
 188     chromField = int(chromField)
 189     count = 0
 190     #TODO: Current algorithm processes input file line by line and compares with prior lines.  Problem is it
 191     #      exits at the first merge.  This is not a problem when the input is sorted by start position, but in
 192     #      the case of 3 regions ABC that are in the input file as ACB as it goes now when processing C there
 193     #      will be no merge with A as B is needed to bridge the two.  When it comes time to process B it will
 194     #      be merged with A but that will exit the loop and the merge with C will be missed.
 195     for regionEntry in regionList:
 196         if regionEntry[0] == "#":
 197             if "pvalue" in regionEntry:
 198                 hasPvalue = 1
 199
 200             if "readShift" in regionEntry:
 201                 hasShift = 1
 202
 203             continue
 204
 205         fields = regionEntry.strip().split("\t")
 206         if minHits >= 0:
 207             try:
 208                 hits = float(fields[scoreField].strip())
 209             except (IndexError, ValueError):
 210                 continue
 211
 212             if hits < minHits:
 213                 continue
 214
 215         if compact:
 216             (chrom, pos) = fields[chromField].split(":")
 217             (front, back) = pos.split("-")
 218             start = int(front)
 219             stop = int(back)
 220         elif chromField > 1:
 221             label = string.join(fields[:chromField],"\t")
 222             chrom = fields[chromField]
 223             start = int(fields[chromField + 1]) - pad
 224             stop = int(fields[chromField + 2]) + pad
 225         else:
 226             label = fields[0]
 227             chrom = fields[1]
 228             start = int(fields[2]) - pad
 229             stop = int(fields[3]) + pad
 230
 231         if not fullChrom:
 232             chrom = chrom[3:]
 233
 234         if keepPeak:
 235             peakPos = int(fields[-2 - hasPvalue - hasShift])
 236             peakHeight = float(fields[-1 - hasPvalue - hasShift])
 237
 238         if chrom not in regions:
 239             regions[chrom] = []
 240
 241         merged = False
 242
 243         if doMerge and len(regions[chrom]) > 0:
 244             for index in range(len(regions[chrom])):
 245                 region = regions[chrom][index]
 246                 rstart = region.start
 247                 rstop = region.stop
 248                 if regionsOverlap(start, stop, rstart, rstop) or regionsAreWithinDistance(start, stop, rstart, rstop, maxDist):
 249                     if start < rstart:
 250                         rstart = start
 251
 252                     if rstop < stop:
 253                         rstop = stop
 254
 255                     if keepPeak:
 256                         rpeakPos = region.peakPos
 257                         rpeakHeight = region.peakHeight
 258                         if peakHeight > rpeakHeight:
 259                             rpeakHeight = peakHeight
 260                             rpeakPos = peakPos
 261
 262                     regions[chrom][index].start = rstart
 263                     regions[chrom][index].stop = rstop
 264                     regions[chrom][index].length = abs(rstop - rstart)
 265                     if keepLabel:
 266                         regions[chrom][index].label = label
 267
 268                     if keepPeak:
 269                         regions[chrom][index].peakPos = rpeakPos
 270                         regions[chrom][index].peakHeight = rpeakHeight
 271
 272
 273                     mergeCount += 1
 274                     merged = True
 275                     break
 276
 277         if not merged:
 278             region = Region.Region(start, stop)
 279             if keepLabel:
 280                 region.label = label
 281
 282             if keepPeak:
 283                 region.peakPos = peakPos
 284                 region.peakHeight = peakHeight
 285
 286             regions[chrom].append(region)
 287             count += 1
 288
 289         if verbose and (count % 100000 == 0):
 290             print count
 291
 292     regionCount = 0
 293     for chrom in regions:
 294         regionCount += len(regions[chrom])
 295         regions[chrom].sort(cmp=lambda x,y:cmp(x.start, y.start))
 296
 297     if verbose:
 298         print "merged %d times" % mergeCount
 299         print "returning %d regions" % regionCount
 300
 301     return regions
 302
 303
 304 def regionsOverlap(start, stop, rstart, rstop):
 305     if start > stop:
 306         (start, stop) = (stop, start)
 307
 308     if rstart > rstop:
 309         (rstart, rstop) = (rstop, rstart)
 310
 311     return (rstart <= start <= rstop) or (rstart <= stop <= rstop) or (start <= rstart <= stop) or (start <= rstop <= stop)
 312
 313
 314 def regionsAreWithinDistance(start, stop, rstart, rstop, maxDist):
 315     if start > stop:
 316         (start, stop) = (stop, start)
 317
 318     if rstart > rstop:
 319         (rstart, rstop) = (rstop, rstart)
 320
 321     return (abs(rstart-stop) <= maxDist) or (abs(rstop-start) <= maxDist)
 322
 323
 324 def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False,
 325              shift=0, maxshift=75):
 326     """ find the peak in a list of reads (hitlist) in a region
 327     of a given length and absolute start point. returns a
 328     list of peaks, the number of hits, a triangular-smoothed
 329     version of hitlist, and the number of reads that are
 330     forward (plus) sense.
 331     If doWeight is True, weight the reads accordingly.
 332     If leftPlus is True, return the number of plus reads left of
 333     the peak, taken to be the first TopPos position.
 334     """
 335
 336     if shift == "auto":
 337         shift = getBestShiftForRegion(hitList, start, length, useWeight=doWeight, maxShift=maxshift)
 338
 339     seqArray, regionArray, numHits, numPlus = findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, leftPlus)
 340
 341     # implementing a triangular smooth
 342     smoothArray = array("f", [0.] * length)
 343     for pos in range(2,length -2):
 344         smoothArray[pos] = (seqArray[pos -2] + 2 * seqArray[pos - 1] + 3 * seqArray[pos] + 2 * seqArray[pos + 1] + seqArray[pos + 2]) / 9.0
 345
 346     topPos = getPeakPositionList(smoothArray, length)
 347     peak = Peak.Peak(topPos, numHits, smoothArray, numPlus, shift=shift)
 348
 349     if leftPlus:
 350         numLeftPlus = 0
 351         maxPos = topPos[0]
 352         for read in regionArray:
 353             if doWeight:
 354                 weight = read["weight"]
 355             else:
 356                 weight = 1.0
 357
 358             currentPos = read["start"] - start
 359             if currentPos <= maxPos and read["sense"] == "+":
 360                 numLeftPlus += weight
 361
 362         peak.numLeftPlus = numLeftPlus
 363
 364     return peak
 365
 366
 367 def getBestShiftForRegion(readList, start, length, useWeight=False, maxShift=75):
 368     bestShift = 0
 369     lowestScore = 20000000000
 370     for testShift in xrange(maxShift + 1):
 371         shiftArray = array("f", [0.] * length)
 372         for read in readList:
 373             currentpos = read["start"] - start
 374             if read["sense"] == "+":
 375                 senseModifier = 1
 376             else:
 377                 senseModifier = -1
 378
 379             currentpos += testShift * senseModifier
 380             if (currentpos < 1) or (currentpos >= length):
 381                 continue
 382
 383             if useWeight:
 384                 weight = read["weight"]
 385             else:
 386                 weight = 1.0
 387
 388             shiftArray[currentpos] += weight * senseModifier
 389
 390         currentScore = 0
 391         for score in shiftArray:
 392             currentScore += abs(score)
 393
 394         if currentScore < lowestScore:
 395             bestShift = testShift
 396             lowestScore = currentScore
 397
 398     return bestShift
 399
 400
 401 def findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, leftPlus):
 402     seqArray = array("f", [0.] * length)
 403     numHits = 0.
 404     numPlus = 0.
 405     regionArray = []
 406     for read in hitList:
 407         currentpos = read["start"] - start
 408         if read["sense"] == "+":
 409             currentpos += shift
 410         else:
 411             currentpos -= shift
 412
 413         if (currentpos <  1 - readlen) or (currentpos >= length):
 414             continue
 415
 416         if doWeight:
 417             weight = read["weight"]
 418         else:
 419             weight = 1.0
 420
 421         numHits += weight
 422         if leftPlus:
 423             regionArray.append(read)
 424
 425         hitIndex = 0
 426         while currentpos < 0:
 427             hitIndex += 1
 428             currentpos += 1
 429
 430         while hitIndex < readlen and currentpos < length:
 431             seqArray[currentpos] += weight
 432             hitIndex += 1
 433             currentpos += 1
 434
 435         if read["sense"] == "+":
 436             numPlus += weight
 437
 438     return seqArray, regionArray, numHits, numPlus
 439
 440
 441 def getPeakPositionList(smoothArray, length):
 442     topNucleotide = 0
 443     peakList = []
 444     for currentpos in xrange(length):
 445         if topNucleotide < smoothArray[currentpos]:
 446             topNucleotide = smoothArray[currentpos]
 447             peakList = [currentpos]
 448         elif topNucleotide == smoothArray[currentpos]:
 449             peakList.append(currentpos)
 450
 451     return peakList
 452
 453
 454 def getFeaturesByChromDict(genomeObject, additionalRegionsDict={}, ignorePseudo=False,
 455                            restrictList=[], regionComplement=False, maxStop=250000000):
 456     """ return a dictionary of cistematic gene features. Requires
 457     cistematic, obviously. Can filter-out pseudogenes. Will use
 458     additional regions dict to supplement gene models, if available.
 459     Can restrict output to a list of GIDs.
 460     If regionComplement is set to true, returns the regions *outside* of the
 461     calculated boundaries, which is useful for retrieving intronic and
 462     intergenic regions. maxStop is simply used to define the uppermost
 463     boundary of the complement region.
 464     """
 465     featuresDict = genomeObject.getallGeneFeatures()
 466     restrictGID = False
 467     if len(restrictList) > 0:
 468         restrictGID = True
 469
 470     if len(additionalRegionsDict) > 0:
 471         sortList = []
 472         for chrom in additionalRegionsDict:
 473             for region in additionalRegionsDict[chrom]:
 474                 label = region.label
 475                 if label not in sortList:
 476                     sortList.append(label)
 477
 478                 if label not in featuresDict:
 479                     featuresDict[label] = []
 480                     sense = "+"
 481                 else:
 482                     sense = featuresDict[label][0][-1]
 483
 484                 featuresDict[label].append(("custom", chrom, region.start, region.stop, sense))
 485
 486         for gid in sortList:
 487             featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
 488
 489     featuresByChromDict = {}
 490     for gid in featuresDict:
 491         if restrictGID and gid not in restrictList:
 492             continue
 493
 494         newFeatureList, chrom, sense = getNewFeatureList(featuresDict[gid])
 495         if newFeatureList and chrom not in featuresByChromDict:
 496             featuresByChromDict[chrom] = []
 497
 498         for (start, stop, ftype) in newFeatureList:
 499             featuresByChromDict[chrom].append((start, stop, gid, sense, ftype))
 500
 501     for chrom in featuresByChromDict:
 502         featuresByChromDict[chrom].sort()
 503
 504     if regionComplement:
 505         complementByChromDict = {}
 506         complementIndex = 0
 507         for chrom in featuresByChromDict:
 508             complementByChromDict[chrom] = []
 509             listLength = len(featuresByChromDict[chrom])
 510             if listLength > 0:
 511                 currentStart = 0
 512                 for index in range(listLength):
 513                     currentStop = featuresByChromDict[chrom][index][0]
 514                     complementIndex += 1
 515                     if currentStart < currentStop:
 516                         complementByChromDict[chrom].append((currentStart, currentStop, "nonExon%d" % complementIndex, "F", "nonExon"))
 517
 518                     currentStart = featuresByChromDict[chrom][index][1]
 519
 520                 currentStop = maxStop
 521                 complementByChromDict[chrom].append((currentStart, currentStop, "nonExon%d" % complementIndex, "F", "nonExon"))
 522
 523         return (featuresByChromDict, complementByChromDict)
 524     else:
 525         return featuresByChromDict
 526
 527
 528 def getNewFeatureList(featureList, ignorePseudo=False):
 529     newFeatureList = []
 530     for (featureType, chrom, start, stop, sense) in featureList:
 531         if featureType == "PSEUDO" and ignorePseudo:
 532             return [], chrom, sense
 533
 534         if (start, stop, featureType) not in newFeatureList:
 535             notContained = True
 536             containedList = []
 537             for (newFeatureStart, newFeatureStop, featureType2) in newFeatureList:
 538                 if start >= newFeatureStart and stop <= newFeatureStop:
 539                     notContained = False
 540
 541                 if start < newFeatureStart and stop > newFeatureStop:
 542                     containedList.append((newFeatureStart, newFeatureStop))
 543
 544             if len(containedList) > 0:
 545                 newFList = []
 546                 notContained = True
 547                 for (newFeatureStart, newFeatureStop, featureType2) in newFeatureList:
 548                     if (newFeatureStart, newFeatureStop) not in containedList:
 549                         newFList.append((newFeatureStart, newFeatureStop, featureType2))
 550                         if start >= newFeatureStart and stop <= newFeatureStop:
 551                             notContained = False
 552
 553                 newFeatureList = newFList
 554
 555             if notContained:
 556                 newFeatureList.append((start, stop, featureType))
 557
 558     return newFeatureList, chrom, sense
 559
 560
 561 def getLocusByChromDict(genome, upstream=0, downstream=0, useCDS=True,
 562                         additionalRegionsDict={}, ignorePseudo=False, upstreamSpanTSS=False,
 563                         lengthCDS=0, keepSense=False, adjustToNeighbor=True):
 564     """ return a dictionary of gene loci. Can be used to retrieve additional
 565     sequence upstream or downstream of gene, up to the next gene. Requires
 566     cistematic, obviously.
 567     Can filter-out pseudogenes and use additional regions outside of existing
 568     gene models. Use upstreamSpanTSS to overlap half of the upstream region
 569     over the TSS.
 570     If lengthCDS > 0 bp, e.g. X, return only the starting X bp from CDS. If
 571     lengthCDS < 0bp, return only the last X bp from CDS.
 572     """
 573     locusByChromDict = {}
 574     if usingBadLocusParameters(upstream, downstream, useCDS, lengthCDS, upstreamSpanTSS):
 575         return locusByChromDict
 576
 577     genomeName = genome.genome
 578     featuresDict = getGeneFeatures(genome, additionalRegionsDict)
 579     for gid in featuresDict:
 580         featureList = featuresDict[gid]
 581         newFeatureList = []
 582         for (ftype, chrom, start, stop, sense) in featureList:
 583             newFeatureList.append((start, stop))
 584
 585         if ignorePseudo and ftype == "PSEUDO":
 586             continue
 587
 588         newFeatureList.sort()
 589
 590         sense = featureList[0][-1]
 591         gstart = newFeatureList[0][0]
 592         gstop = newFeatureList[-1][1]
 593         glen = abs(gstart - gstop)
 594         if sense == "F":
 595             if not useCDS and upstream > 0:
 596                 if upstreamSpanTSS:
 597                     if gstop > (gstart + upstream / 2):
 598                         gstop = gstart + upstream / 2
 599                 else:
 600                     gstop = gstart
 601             elif not useCDS and downstream > 0:
 602                 gstart = gstop
 603
 604             if upstream > 0:
 605                 if upstreamSpanTSS:
 606                     distance = upstream / 2
 607                 else:
 608                     distance = upstream
 609
 610                 if adjustToNeighbor:
 611                     nextGene = genome.leftGeneDistance((genomeName, gid), distance * 2)
 612                     if nextGene < distance * 2:
 613                         distance = nextGene / 2
 614
 615                 distance = max(distance, 1)
 616                 gstart -= distance
 617
 618             if downstream > 0:
 619                 distance = downstream
 620                 if adjustToNeighbor:
 621                     nextGene = genome.rightGeneDistance((genomeName, gid), downstream * 2)
 622                     if nextGene < downstream * 2:
 623                         distance = nextGene / 2
 624
 625                 distance = max(distance, 1)
 626                 gstop += distance
 627
 628             if 0 < lengthCDS < glen:
 629                 gstop = newFeatureList[0][0] + lengthCDS
 630
 631             if lengthCDS < 0 and abs(lengthCDS) < glen:
 632                 gstart = newFeatureList[-1][1] + lengthCDS
 633         else:
 634             if not useCDS and upstream > 0:
 635                 if upstreamSpanTSS:
 636                     if gstart < (gstop - upstream / 2):
 637                         gstart = gstop - upstream / 2
 638                 else:
 639                     gstart = gstop
 640             elif not useCDS and downstream > 0:
 641                     gstop = gstart
 642
 643             if upstream > 0:
 644                 if upstreamSpanTSS:
 645                     distance = upstream /2
 646                 else:
 647                     distance = upstream
 648
 649                 if adjustToNeighbor:
 650                     nextGene = genome.rightGeneDistance((genomeName, gid), distance * 2)
 651                     if nextGene < distance * 2:
 652                         distance = nextGene / 2
 653
 654                 distance = max(distance, 1)
 655                 gstop += distance
 656
 657             if downstream > 0:
 658                 distance = downstream
 659                 if adjustToNeighbor:
 660                     nextGene = genome.leftGeneDistance((genomeName, gid), downstream * 2)
 661                     if nextGene < downstream * 2:
 662                         distance = nextGene / 2
 663
 664                 distance = max(distance, 1)
 665                 gstart -= distance
 666
 667             if 0 < lengthCDS < glen:
 668                 gstart = newFeatureList[-1][-1] - lengthCDS
 669
 670             if lengthCDS < 0 and abs(lengthCDS) < glen:
 671                 gstop = newFeatureList[0][0] - lengthCDS
 672
 673         if chrom not in locusByChromDict:
 674             locusByChromDict[chrom] = []
 675
 676         if keepSense:
 677             locusByChromDict[chrom].append((gstart, gstop, gid, glen, sense))
 678         else:
 679             locusByChromDict[chrom].append((gstart, gstop, gid, glen))
 680
 681     for chrom in locusByChromDict:
 682         locusByChromDict[chrom].sort()
 683
 684     return locusByChromDict
 685
 686
 687 def usingBadLocusParameters(upstream, downstream, useCDS, lengthCDS, upstreamSpanTSS):
 688     if upstream == 0 and downstream == 0 and not useCDS:
 689         print "getLocusByChromDict: asked for no sequence - returning empty dict"
 690         return True
 691     elif upstream > 0 and downstream > 0 and not useCDS:
 692         print "getLocusByChromDict: asked for only upstream and downstream - returning empty dict"
 693         return True
 694     elif lengthCDS != 0 and not useCDS:
 695         print "getLocusByChromDict: asked for partial CDS but not useCDS - returning empty dict"
 696         return True
 697     elif upstreamSpanTSS and lengthCDS != 0:
 698         print "getLocusByChromDict: asked for TSS spanning and partial CDS - returning empty dict"
 699         return True
 700     elif lengthCDS > 0 and downstream > 0:
 701         print "getLocusByChromDict: asked for discontinuous partial CDS from start and downstream - returning empty dict"
 702         return True
 703     elif lengthCDS < 0 and upstream > 0:
 704         print "getLocusByChromDict: asked for discontinuous partial CDS from stop and upstream - returning empty dict"
 705         return True
 706
 707     return False
 708
 709
 710 def getGeneFeatures(genome, additionalRegionsDict):
 711     featuresDict = genome.getallGeneFeatures()
 712     if len(additionalRegionsDict) > 0:
 713         sortList = []
 714         for chrom in additionalRegionsDict:
 715             for region in additionalRegionsDict[chrom]:
 716                 label = region.label
 717                 if label not in sortList:
 718                     sortList.append(label)
 719
 720                 if label not in featuresDict:
 721                     featuresDict[label] = []
 722                     sense = "+"
 723                 else:
 724                     sense = featuresDict[label][0][-1]
 725
 726                 featuresDict[label].append(("custom", chrom, region.start, region.stop, sense))
 727
 728         for gid in sortList:
 729             featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
 730
 731     return featuresDict
 732
 733
 734 def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[],
 735                       normalizedTag=1., defaultRegionFormat=True, fixedFirstBin=-1,
 736                       binLength=-1):
 737     """ returns 2 dictionaries of bin counts and region lengths, given a dictionary of predefined regions,
 738         a dictionary of reads, a number of bins, the length of reads, and optionally a list of regions
 739         or a different weight / tag.
 740     """
 741     index = 0
 742     regionsBins = {}
 743     regionsLen = {}
 744
 745     if defaultRegionFormat:
 746         regionIDField = 0
 747         startField = 1
 748         stopField = 2
 749         lengthField = 3
 750     else:
 751         startField = 0
 752         stopField = 1
 753         regionIDField = 2
 754         lengthField = 3
 755
 756     senseField = 4
 757
 758     print "entering computeRegionBins"
 759     if len(regionList) > 0:
 760         for readID in regionList:
 761             regionsBins[readID] = [0.] * bins
 762     else:
 763         for chrom in regionsByChromDict:
 764             for regionTuple in regionsByChromDict[chrom]:
 765                 regionID = regionTuple[regionIDField]
 766                 regionsBins[regionID] = [0.] * bins
 767
 768     for chrom in hitDict:
 769         if chrom not in regionsByChromDict:
 770             continue
 771
 772         for regionTuple in regionsByChromDict[chrom]:
 773             regionID = regionTuple[regionIDField]
 774             regionsLen[regionID] = regionTuple[lengthField]
 775
 776         print "%s\n" % chrom
 777         startRegion = 0
 778         for read in hitDict[chrom]:
 779             tagStart = read["start"]
 780             weight = read["weight"]
 781             index += 1
 782             if index % 100000 == 0:
 783                 print "read %d " % index,
 784
 785             stopPoint = tagStart + readlen
 786             if startRegion < 0:
 787                 startRegion = 0
 788
 789             for regionTuple in regionsByChromDict[chrom][startRegion:]:
 790                 start = regionTuple[startField]
 791                 stop = regionTuple[stopField]
 792                 regionID = regionTuple[regionIDField]
 793                 rlen = regionTuple[lengthField]
 794                 try:
 795                     rsense = regionTuple[senseField]
 796                 except IndexError:
 797                     rsense = "F"
 798
 799                 if tagStart > stop:
 800                     startRegion += 1
 801                     continue
 802
 803                 if start > stopPoint:
 804                     startRegion -= 10
 805                     break
 806
 807                 if start <= tagStart <= stop:
 808                     if binLength < 1:
 809                         regionBinLength = rlen / bins
 810                     else:
 811                         regionBinLength = binLength
 812
 813                     if rsense == "F":
 814                         positionInRegion = tagStart - start
 815                     else:
 816                         positionInRegion = start + rlen - tagStart
 817
 818                     # we are relying on python's integer division quirk
 819                     binID = positionInRegion / regionBinLength
 820                     if (fixedFirstBin > 0) and (positionInRegion < fixedFirstBin):
 821                         binID = 0
 822                     elif fixedFirstBin > 0:
 823                         binID = 1
 824
 825                     if binID >= bins:
 826                         binID = bins - 1
 827
 828                     try:
 829                         regionsBins[regionID][binID] += normalizedTag * weight
 830                     except KeyError:
 831                         print "%s %s" % (regionID, str(binID))
 832
 833                     stopPoint = stop
 834
 835     return (regionsBins, regionsLen)