commoncode.py

   1 import ConfigParser
   2 import os
   3 import string
   4 from time import strftime
   5 from array import array
   6 from collections import defaultdict
   7 import Peak
   8 from cistematic.core.geneinfo import geneinfoDB
   9 from cistematic.genomes import Genome
  10 import Region
  11
  12 commoncodeVersion = 5.6
  13 currentRDSversion = 2.0
  14
  15 class ErangeError(Exception):
  16     pass
  17
  18
  19 def getReverseComplement(base):
  20     revComp = {"A": "T",
  21                "T": "A",
  22                "G": "C",
  23                "C": "G",
  24                "N": "N"
  25         }
  26
  27     return revComp[base]
  28
  29
  30 def countDuplicatesInList(listToCheck):
  31     tally = defaultdict(int)
  32     for item in listToCheck:
  33         tally[item] += 1
  34
  35     return tally.items()
  36
  37
  38 def writeLog(logFile, messenger, message):
  39     """ create a log file to write a message from a messenger or append to an existing file.
  40     """
  41     try:
  42         logfile = open(logFile)
  43     except IOError:
  44         logfile = open(logFile, "w")
  45     else:
  46         logfile = open(logFile, "a")
  47
  48     logfile.writelines("%s: [%s] %s\n" % (strftime("%Y-%m-%d %H:%M:%S"), messenger, message))
  49     logfile.close()
  50
  51
  52 def getGeneInfoDict(genome, cache=False):
  53     idb = geneinfoDB(cache=cache)
  54     if genome == "dmelanogaster":
  55         geneinfoDict = idb.getallGeneInfo(genome, infoKey="locus")
  56     else:
  57         geneinfoDict = idb.getallGeneInfo(genome)
  58
  59     return geneinfoDict
  60
  61
  62 def getGeneAnnotDict(genome, inRAM=False):
  63     return getExtendedGeneAnnotDict(genome, "", inRAM=inRAM)
  64
  65
  66 def getExtendedGeneAnnotDict(genomeName, extendGenome, replaceModels=False, inRAM=False):
  67     genome = Genome(genomeName, inRAM=inRAM)
  68     if extendGenome != "":
  69         genome.extendFeatures(extendGenome, replace=replaceModels)
  70
  71     geneannotDict = genome.allAnnotInfo()
  72
  73     return geneannotDict
  74
  75
  76 def getConfigParser(fileList=[]):
  77     configFiles = ["erange.config", os.path.expanduser("~/.erange.config")]
  78     for filename in fileList:
  79         configFiles.append(filename)
  80
  81     config = ConfigParser.SafeConfigParser()
  82     config.read(configFiles)
  83
  84     return config
  85
  86
  87 def getConfigOption(parser, section, option, default=None):
  88     try:
  89         setting = parser.get(section, option)
  90     except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
  91         setting = default
  92
  93     return setting
  94
  95
  96 def getConfigIntOption(parser, section, option, default=None):
  97     try:
  98         setting = parser.getint(section, option)
  99     except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
 100         setting = default
 101
 102     return setting
 103
 104
 105 def getConfigFloatOption(parser, section, option, default=None):
 106     try:
 107         setting = parser.getfloat(section, option)
 108     except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
 109         setting = default
 110
 111     return setting
 112
 113
 114 def getConfigBoolOption(parser, section, option, default=None):
 115     try:
 116         setting = parser.getboolean(section, option)
 117     except (ConfigParser.NoSectionError, ConfigParser.NoOptionError, ValueError):
 118         setting = default
 119
 120     return setting
 121
 122
 123 def getAllConfigSectionOptions(parser, section):
 124     try:
 125         setting = parser.items(section)
 126     except ConfigParser.NoSectionError:
 127         setting = []
 128
 129     return setting
 130
 131
 132 def getMergedRegions(regionfilename, maxDist=1000, minHits=0, verbose=False, keepLabel=False,
 133                      fullChrom=False, chromField=1, scoreField=4, pad=0, compact=False,
 134                      doMerge=True, keepPeak=False, returnTop=0):
 135
 136     """ returns a dictionary containing a list of merged overlapping regions by chromosome;
 137     can optionally filter regions that have a scoreField fewer than minHits.
 138     Can also optionally return the label of each region, as well as the
 139     peak, if supplied (peakPos and peakHeight should be the last 2 fields).
 140     Can return the top regions based on score if higher than minHits.
 141     """
 142     infile = open(regionfilename)
 143     lines = infile.readlines()
 144     regions = getMergedRegionsFromList(lines, maxDist, minHits, verbose, keepLabel,
 145                                        fullChrom, chromField, scoreField, pad, compact,
 146                                        doMerge, keepPeak, returnTop)
 147
 148     infile.close()
 149
 150     return regions
 151
 152
 153 def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False, keepLabel=False,
 154                      fullChrom = False, chromField=1, scoreField=4, pad=0, compact=False,
 155                      doMerge=True, keepPeak=False, returnTop=0):
 156     """ returns a dictionary containing a list of merged overlapping regions by chromosome;
 157     can optionally filter regions that have a scoreField fewer than minHits.
 158     Can also optionally return the label of each region, as well as the
 159     peak, if supplied (peakPos and peakHeight should be the last 2 fields).
 160     Can return the top regions based on score if higher than minHits.
 161     """
 162     regions = {}
 163     hasPvalue = 0
 164     hasShift = 0
 165     if 0 < returnTop < len(regionList):
 166         scores = []
 167         for regionEntry in regionList:
 168             if regionEntry[0] == "#":
 169                 if "pvalue" in regionEntry:
 170                     hasPvalue = 1
 171
 172                 if "readShift" in regionEntry:
 173                     hasShift = 1
 174
 175                 continue
 176
 177             fields = regionEntry.strip().split("\t")
 178             hits = float(fields[scoreField].strip())
 179             scores.append(hits)
 180
 181         scores.sort()
 182         returnTop = -1 * returnTop
 183         minScore = scores[returnTop]
 184         if minScore > minHits:
 185             minHits = minScore
 186
 187     mergeCount = 0
 188     chromField = int(chromField)
 189     count = 0
 190     #TODO: Current algorithm processes input file line by line and compares with prior lines.  Problem is it
 191     #      exits at the first merge.  This is not a problem when the input is sorted by start position, but in
 192     #      the case of 3 regions ABC that are in the input file as ACB as it goes now when processing C there
 193     #      will be no merge with A as B is needed to bridge the two.  When it comes time to process B it will
 194     #      be merged with A but that will exit the loop and the merge with C will be missed.
 195     #TODO: QForAli - Are we going to require sorted region files to have this even work?
 196     for regionEntry in regionList:
 197         if regionEntry[0] == "#":
 198             if "pvalue" in regionEntry:
 199                 hasPvalue = 1
 200
 201             if "readShift" in regionEntry:
 202                 hasShift = 1
 203
 204             continue
 205
 206         fields = regionEntry.strip().split("\t")
 207         if minHits >= 0:
 208             try:
 209                 hits = float(fields[scoreField].strip())
 210             except (IndexError, ValueError):
 211                 continue
 212
 213             if hits < minHits:
 214                 continue
 215
 216         if compact:
 217             (chrom, pos) = fields[chromField].split(":")
 218             (front, back) = pos.split("-")
 219             start = int(front)
 220             stop = int(back)
 221         elif chromField > 1:
 222             label = string.join(fields[:chromField],"\t")
 223             chrom = fields[chromField]
 224             start = int(fields[chromField + 1]) - pad
 225             stop = int(fields[chromField + 2]) + pad
 226         else:
 227             label = fields[0]
 228             chrom = fields[1]
 229             start = int(fields[2]) - pad
 230             stop = int(fields[3]) + pad
 231
 232         if not fullChrom:
 233             chrom = chrom[3:]
 234
 235         if keepPeak:
 236             peakPos = int(fields[-2 - hasPvalue - hasShift])
 237             peakHeight = float(fields[-1 - hasPvalue - hasShift])
 238
 239         if chrom not in regions:
 240             regions[chrom] = []
 241
 242         merged = False
 243
 244         if doMerge and len(regions[chrom]) > 0:
 245             for index in range(len(regions[chrom])):
 246                 region = regions[chrom][index]
 247                 rstart = region.start
 248                 rstop = region.stop
 249                 if regionsOverlap(start, stop, rstart, rstop) or regionsAreWithinDistance(start, stop, rstart, rstop, maxDist):
 250                     if start < rstart:
 251                         rstart = start
 252
 253                     if rstop < stop:
 254                         rstop = stop
 255
 256                     if keepPeak:
 257                         rpeakPos = region.peakPos
 258                         rpeakHeight = region.peakHeight
 259                         if peakHeight > rpeakHeight:
 260                             rpeakHeight = peakHeight
 261                             rpeakPos = peakPos
 262
 263                     regions[chrom][index].start = rstart
 264                     regions[chrom][index].stop = rstop
 265                     regions[chrom][index].length = abs(rstop - rstart)
 266                     if keepLabel:
 267                         regions[chrom][index].label = label
 268
 269                     if keepPeak:
 270                         regions[chrom][index].peakPos = rpeakPos
 271                         regions[chrom][index].peakHeight = rpeakHeight
 272
 273
 274                     mergeCount += 1
 275                     merged = True
 276                     break
 277
 278         if not merged:
 279             region = Region.Region(start, stop)
 280             if keepLabel:
 281                 region.label = label
 282
 283             if keepPeak:
 284                 region.peakPos = peakPos
 285                 region.peakHeight = peakHeight
 286
 287             regions[chrom].append(region)
 288             count += 1
 289
 290         if verbose and (count % 100000 == 0):
 291             print count
 292
 293     regionCount = 0
 294     for chrom in regions:
 295         regionCount += len(regions[chrom])
 296         regions[chrom].sort(cmp=lambda x,y:cmp(x.start, y.start))
 297
 298     if verbose:
 299         print "merged %d times" % mergeCount
 300         print "returning %d regions" % regionCount
 301
 302     return regions
 303
 304
 305 def regionsOverlap(start, stop, rstart, rstop):
 306     if start > stop:
 307         (start, stop) = (stop, start)
 308
 309     if rstart > rstop:
 310         (rstart, rstop) = (rstop, rstart)
 311
 312     return (rstart <= start <= rstop) or (rstart <= stop <= rstop) or (start <= rstart <= stop) or (start <= rstop <= stop)
 313
 314
 315 def regionsAreWithinDistance(start, stop, rstart, rstop, maxDist):
 316     if start > stop:
 317         (start, stop) = (stop, start)
 318
 319     if rstart > rstop:
 320         (rstart, rstop) = (rstop, rstart)
 321
 322     return (abs(rstart-stop) <= maxDist) or (abs(rstop-start) <= maxDist)
 323
 324
 325 def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False,
 326              shift=0, maxshift=75):
 327     """ find the peak in a list of reads (hitlist) in a region
 328     of a given length and absolute start point. returns a
 329     list of peaks, the number of hits, a triangular-smoothed
 330     version of hitlist, and the number of reads that are
 331     forward (plus) sense.
 332     If doWeight is True, weight the reads accordingly.
 333     If leftPlus is True, return the number of plus reads left of
 334     the peak, taken to be the first TopPos position.
 335     """
 336
 337     if shift == "auto":
 338         shift = getBestShiftForRegion(hitList, start, length, useWeight=doWeight, maxShift=maxshift)
 339
 340     seqArray, regionArray, numHits, numPlus = findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, leftPlus)
 341
 342     # implementing a triangular smooth
 343     smoothArray = array("f", [0.] * length)
 344     #TODO: QForAli - really?  We're going to insert MANUFACTURED data (0.0's) and use them in the calculation
 345     # (probably) just to avoid IndexError exceptions.  Then we are going to completely ignore adjusting the
 346     # last 2 elements of the array for (probably) the same issue.  Is this how erange is dealing with the
 347     # edge cases?
 348     for pos in range(2,length -2):
 349         smoothArray[pos] = (seqArray[pos -2] + 2 * seqArray[pos - 1] + 3 * seqArray[pos] + 2 * seqArray[pos + 1] + seqArray[pos + 2]) / 9.0
 350
 351     topPos = getPeakPositionList(smoothArray, length)
 352     peak = Peak.Peak(topPos, numHits, smoothArray, numPlus, shift=shift)
 353
 354     if leftPlus:
 355         numLeftPlus = 0
 356         maxPos = topPos[0]
 357         for read in regionArray:
 358             if doWeight:
 359                 weight = read["weight"]
 360             else:
 361                 weight = 1.0
 362
 363             currentPos = read["start"] - start
 364             if currentPos <= maxPos and read["sense"] == "+":
 365                 numLeftPlus += weight
 366
 367         peak.numLeftPlus = numLeftPlus
 368
 369     return peak
 370
 371
 372 def getBestShiftForRegion(readList, start, length, useWeight=False, maxShift=75):
 373     bestShift = 0
 374     lowestScore = 20000000000
 375     for testShift in xrange(maxShift + 1):
 376         shiftArray = array("f", [0.] * length)
 377         for read in readList:
 378             currentpos = read["start"] - start
 379             if read["sense"] == "+":
 380                 senseModifier = 1
 381             else:
 382                 senseModifier = -1
 383
 384             currentpos += testShift * senseModifier
 385             if (currentpos < 1) or (currentpos >= length):
 386                 continue
 387
 388             if useWeight:
 389                 weight = read["weight"]
 390             else:
 391                 weight = 1.0
 392
 393             shiftArray[currentpos] += weight * senseModifier
 394
 395         currentScore = 0
 396         for score in shiftArray:
 397             currentScore += abs(score)
 398
 399         if currentScore < lowestScore:
 400             bestShift = testShift
 401             lowestScore = currentScore
 402
 403     return bestShift
 404
 405
 406 def findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, leftPlus):
 407     seqArray = array("f", [0.] * length)
 408     numHits = 0.
 409     numPlus = 0.
 410     regionArray = []
 411     for read in hitList:
 412         currentpos = read["start"] - start
 413         if read["sense"] == "+":
 414             currentpos += shift
 415         else:
 416             currentpos -= shift
 417
 418         if (currentpos <  1 - readlen) or (currentpos >= length):
 419             continue
 420
 421         if doWeight:
 422             weight = read["weight"]
 423         else:
 424             weight = 1.0
 425
 426         numHits += weight
 427         if leftPlus:
 428             regionArray.append(read)
 429
 430         hitIndex = 0
 431         while currentpos < 0:
 432             hitIndex += 1
 433             currentpos += 1
 434
 435         while hitIndex < readlen and currentpos < length:
 436             seqArray[currentpos] += weight
 437             hitIndex += 1
 438             currentpos += 1
 439
 440         if read["sense"] == "+":
 441             numPlus += weight
 442
 443     return seqArray, regionArray, numHits, numPlus
 444
 445
 446 def getPeakPositionList(smoothArray, length):
 447     topNucleotide = 0
 448     peakList = []
 449     for currentpos in xrange(length):
 450         if topNucleotide < smoothArray[currentpos]:
 451             topNucleotide = smoothArray[currentpos]
 452             peakList = [currentpos]
 453         elif topNucleotide == smoothArray[currentpos]:
 454             peakList.append(currentpos)
 455
 456     return peakList
 457
 458
 459 def getFeaturesByChromDict(genomeObject, additionalRegionsDict={}, ignorePseudo=False,
 460                            restrictList=[], regionComplement=False, maxStop=250000000):
 461     """ return a dictionary of cistematic gene features. Requires
 462     cistematic, obviously. Can filter-out pseudogenes. Will use
 463     additional regions dict to supplement gene models, if available.
 464     Can restrict output to a list of GIDs.
 465     If regionComplement is set to true, returns the regions *outside* of the
 466     calculated boundaries, which is useful for retrieving intronic and
 467     intergenic regions. maxStop is simply used to define the uppermost
 468     boundary of the complement region.
 469     """
 470     featuresDict = genomeObject.getallGeneFeatures()
 471     restrictGID = False
 472     if len(restrictList) > 0:
 473         restrictGID = True
 474
 475     if len(additionalRegionsDict) > 0:
 476         sortList = []
 477         for chrom in additionalRegionsDict:
 478             for region in additionalRegionsDict[chrom]:
 479                 label = region.label
 480                 if label not in sortList:
 481                     sortList.append(label)
 482
 483                 if label not in featuresDict:
 484                     featuresDict[label] = []
 485                     sense = "+"
 486                 else:
 487                     sense = featuresDict[label][0][-1]
 488
 489                 featuresDict[label].append(("custom", chrom, region.start, region.stop, sense))
 490
 491         for gid in sortList:
 492             featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
 493
 494     featuresByChromDict = {}
 495     for gid in featuresDict:
 496         if restrictGID and gid not in restrictList:
 497             continue
 498
 499         newFeatureList, chrom, sense = getNewFeatureList(featuresDict[gid])
 500         if newFeatureList and chrom not in featuresByChromDict:
 501             featuresByChromDict[chrom] = []
 502
 503         for (start, stop, ftype) in newFeatureList:
 504             featuresByChromDict[chrom].append((start, stop, gid, sense, ftype))
 505
 506     for chrom in featuresByChromDict:
 507         featuresByChromDict[chrom].sort()
 508
 509     if regionComplement:
 510         complementByChromDict = {}
 511         complementIndex = 0
 512         for chrom in featuresByChromDict:
 513             complementByChromDict[chrom] = []
 514             listLength = len(featuresByChromDict[chrom])
 515             if listLength > 0:
 516                 currentStart = 0
 517                 for index in range(listLength):
 518                     currentStop = featuresByChromDict[chrom][index][0]
 519                     complementIndex += 1
 520                     if currentStart < currentStop:
 521                         complementByChromDict[chrom].append((currentStart, currentStop, "nonExon%d" % complementIndex, "F", "nonExon"))
 522
 523                     currentStart = featuresByChromDict[chrom][index][1]
 524
 525                 currentStop = maxStop
 526                 complementByChromDict[chrom].append((currentStart, currentStop, "nonExon%d" % complementIndex, "F", "nonExon"))
 527
 528         return (featuresByChromDict, complementByChromDict)
 529     else:
 530         return featuresByChromDict
 531
 532
 533 def getNewFeatureList(featureList, ignorePseudo=False):
 534     newFeatureList = []
 535     for (featureType, chrom, start, stop, sense) in featureList:
 536         if featureType == "PSEUDO" and ignorePseudo:
 537             return [], chrom, sense
 538
 539         if (start, stop, featureType) not in newFeatureList:
 540             notContained = True
 541             containedList = []
 542             for (newFeatureStart, newFeatureStop, featureType2) in newFeatureList:
 543                 if start >= newFeatureStart and stop <= newFeatureStop:
 544                     notContained = False
 545
 546                 if start < newFeatureStart and stop > newFeatureStop:
 547                     containedList.append((newFeatureStart, newFeatureStop))
 548
 549             if len(containedList) > 0:
 550                 newFList = []
 551                 notContained = True
 552                 for (newFeatureStart, newFeatureStop, featureType2) in newFeatureList:
 553                     if (newFeatureStart, newFeatureStop) not in containedList:
 554                         newFList.append((newFeatureStart, newFeatureStop, featureType2))
 555                         if start >= newFeatureStart and stop <= newFeatureStop:
 556                             notContained = False
 557
 558                 newFeatureList = newFList
 559
 560             if notContained:
 561                 newFeatureList.append((start, stop, featureType))
 562
 563     return newFeatureList, chrom, sense
 564
 565
 566 def getLocusByChromDict(genome, upstream=0, downstream=0, useCDS=True,
 567                         additionalRegionsDict={}, ignorePseudo=False, upstreamSpanTSS=False,
 568                         lengthCDS=0, keepSense=False, adjustToNeighbor=True):
 569     """ return a dictionary of gene loci. Can be used to retrieve additional
 570     sequence upstream or downstream of gene, up to the next gene. Requires
 571     cistematic, obviously.
 572     Can filter-out pseudogenes and use additional regions outside of existing
 573     gene models. Use upstreamSpanTSS to overlap half of the upstream region
 574     over the TSS.
 575     If lengthCDS > 0 bp, e.g. X, return only the starting X bp from CDS. If
 576     lengthCDS < 0bp, return only the last X bp from CDS.
 577     """
 578     locusByChromDict = {}
 579     if usingBadLocusParameters(upstream, downstream, useCDS, lengthCDS, upstreamSpanTSS):
 580         return locusByChromDict
 581
 582     genomeName = genome.genome
 583     featuresDict = getGeneFeatures(genome, additionalRegionsDict)
 584     for gid in featuresDict:
 585         featureList = featuresDict[gid]
 586         newFeatureList = []
 587         for (ftype, chrom, start, stop, sense) in featureList:
 588             newFeatureList.append((start, stop))
 589
 590         if ignorePseudo and ftype == "PSEUDO":
 591             continue
 592
 593         newFeatureList.sort()
 594
 595         sense = featureList[0][-1]
 596         gstart = newFeatureList[0][0]
 597         gstop = newFeatureList[-1][1]
 598         glen = abs(gstart - gstop)
 599         if sense == "F":
 600             if not useCDS and upstream > 0:
 601                 if upstreamSpanTSS:
 602                     if gstop > (gstart + upstream / 2):
 603                         gstop = gstart + upstream / 2
 604                 else:
 605                     gstop = gstart
 606             elif not useCDS and downstream > 0:
 607                 gstart = gstop
 608
 609             if upstream > 0:
 610                 if upstreamSpanTSS:
 611                     distance = upstream / 2
 612                 else:
 613                     distance = upstream
 614
 615                 if adjustToNeighbor:
 616                     nextGene = genome.leftGeneDistance((genomeName, gid), distance * 2)
 617                     if nextGene < distance * 2:
 618                         distance = nextGene / 2
 619
 620                 distance = max(distance, 1)
 621                 gstart -= distance
 622
 623             if downstream > 0:
 624                 distance = downstream
 625                 if adjustToNeighbor:
 626                     nextGene = genome.rightGeneDistance((genomeName, gid), downstream * 2)
 627                     if nextGene < downstream * 2:
 628                         distance = nextGene / 2
 629
 630                 distance = max(distance, 1)
 631                 gstop += distance
 632
 633             if 0 < lengthCDS < glen:
 634                 gstop = newFeatureList[0][0] + lengthCDS
 635
 636             if lengthCDS < 0 and abs(lengthCDS) < glen:
 637                 gstart = newFeatureList[-1][1] + lengthCDS
 638         else:
 639             if not useCDS and upstream > 0:
 640                 if upstreamSpanTSS:
 641                     if gstart < (gstop - upstream / 2):
 642                         gstart = gstop - upstream / 2
 643                 else:
 644                     gstart = gstop
 645             elif not useCDS and downstream > 0:
 646                     gstop = gstart
 647
 648             if upstream > 0:
 649                 if upstreamSpanTSS:
 650                     distance = upstream /2
 651                 else:
 652                     distance = upstream
 653
 654                 if adjustToNeighbor:
 655                     nextGene = genome.rightGeneDistance((genomeName, gid), distance * 2)
 656                     if nextGene < distance * 2:
 657                         distance = nextGene / 2
 658
 659                 distance = max(distance, 1)
 660                 gstop += distance
 661
 662             if downstream > 0:
 663                 distance = downstream
 664                 if adjustToNeighbor:
 665                     nextGene = genome.leftGeneDistance((genomeName, gid), downstream * 2)
 666                     if nextGene < downstream * 2:
 667                         distance = nextGene / 2
 668
 669                 distance = max(distance, 1)
 670                 gstart -= distance
 671
 672             if 0 < lengthCDS < glen:
 673                 gstart = newFeatureList[-1][-1] - lengthCDS
 674
 675             if lengthCDS < 0 and abs(lengthCDS) < glen:
 676                 gstop = newFeatureList[0][0] - lengthCDS
 677
 678         if chrom not in locusByChromDict:
 679             locusByChromDict[chrom] = []
 680
 681         if keepSense:
 682             locusByChromDict[chrom].append((gstart, gstop, gid, glen, sense))
 683         else:
 684             locusByChromDict[chrom].append((gstart, gstop, gid, glen))
 685
 686     for chrom in locusByChromDict:
 687         locusByChromDict[chrom].sort()
 688
 689     return locusByChromDict
 690
 691
 692 def usingBadLocusParameters(upstream, downstream, useCDS, lengthCDS, upstreamSpanTSS):
 693     if upstream == 0 and downstream == 0 and not useCDS:
 694         print "getLocusByChromDict: asked for no sequence - returning empty dict"
 695         return True
 696     elif upstream > 0 and downstream > 0 and not useCDS:
 697         print "getLocusByChromDict: asked for only upstream and downstream - returning empty dict"
 698         return True
 699     elif lengthCDS != 0 and not useCDS:
 700         print "getLocusByChromDict: asked for partial CDS but not useCDS - returning empty dict"
 701         return True
 702     elif upstreamSpanTSS and lengthCDS != 0:
 703         print "getLocusByChromDict: asked for TSS spanning and partial CDS - returning empty dict"
 704         return True
 705     elif lengthCDS > 0 and downstream > 0:
 706         print "getLocusByChromDict: asked for discontinuous partial CDS from start and downstream - returning empty dict"
 707         return True
 708     elif lengthCDS < 0 and upstream > 0:
 709         print "getLocusByChromDict: asked for discontinuous partial CDS from stop and upstream - returning empty dict"
 710         return True
 711
 712     return False
 713
 714
 715 def getGeneFeatures(genome, additionalRegionsDict):
 716     featuresDict = genome.getallGeneFeatures()
 717     if len(additionalRegionsDict) > 0:
 718         sortList = []
 719         for chrom in additionalRegionsDict:
 720             for region in additionalRegionsDict[chrom]:
 721                 label = region.label
 722                 if label not in sortList:
 723                     sortList.append(label)
 724
 725                 if label not in featuresDict:
 726                     featuresDict[label] = []
 727                     sense = "+"
 728                 else:
 729                     sense = featuresDict[label][0][-1]
 730
 731                 featuresDict[label].append(("custom", chrom, region.start, region.stop, sense))
 732
 733         for gid in sortList:
 734             featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
 735
 736     return featuresDict
 737
 738
 739 def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[],
 740                       normalizedTag=1., defaultRegionFormat=True, fixedFirstBin=-1,
 741                       binLength=-1):
 742     """ returns 2 dictionaries of bin counts and region lengths, given a dictionary of predefined regions,
 743         a dictionary of reads, a number of bins, the length of reads, and optionally a list of regions
 744         or a different weight / tag.
 745     """
 746     index = 0
 747     regionsBins = {}
 748     regionsLen = {}
 749
 750     if defaultRegionFormat:
 751         regionIDField = 0
 752         startField = 1
 753         stopField = 2
 754         lengthField = 3
 755     else:
 756         startField = 0
 757         stopField = 1
 758         regionIDField = 2
 759         lengthField = 3
 760
 761     senseField = 4
 762
 763     print "entering computeRegionBins"
 764     if len(regionList) > 0:
 765         for readID in regionList:
 766             regionsBins[readID] = [0.] * bins
 767     else:
 768         for chrom in regionsByChromDict:
 769             for regionTuple in regionsByChromDict[chrom]:
 770                 regionID = regionTuple[regionIDField]
 771                 regionsBins[regionID] = [0.] * bins
 772
 773     for chrom in hitDict:
 774         if chrom not in regionsByChromDict:
 775             continue
 776
 777         for regionTuple in regionsByChromDict[chrom]:
 778             regionID = regionTuple[regionIDField]
 779             regionsLen[regionID] = regionTuple[lengthField]
 780
 781         print "%s\n" % chrom
 782         startRegion = 0
 783         for read in hitDict[chrom]:
 784             tagStart = read["start"]
 785             weight = read["weight"]
 786             index += 1
 787             if index % 100000 == 0:
 788                 print "read %d " % index,
 789
 790             stopPoint = tagStart + readlen
 791             if startRegion < 0:
 792                 startRegion = 0
 793
 794             for regionTuple in regionsByChromDict[chrom][startRegion:]:
 795                 start = regionTuple[startField]
 796                 stop = regionTuple[stopField]
 797                 regionID = regionTuple[regionIDField]
 798                 rlen = regionTuple[lengthField]
 799                 try:
 800                     rsense = regionTuple[senseField]
 801                 except IndexError:
 802                     rsense = "F"
 803
 804                 if tagStart > stop:
 805                     startRegion += 1
 806                     continue
 807
 808                 if start > stopPoint:
 809                     startRegion -= 10
 810                     break
 811
 812                 if start <= tagStart <= stop:
 813                     if binLength < 1:
 814                         regionBinLength = rlen / bins
 815                     else:
 816                         regionBinLength = binLength
 817
 818                     if rsense == "F":
 819                         positionInRegion = tagStart - start
 820                     else:
 821                         positionInRegion = start + rlen - tagStart
 822
 823                     # we are relying on python's integer division quirk
 824                     binID = positionInRegion / regionBinLength
 825                     if (fixedFirstBin > 0) and (positionInRegion < fixedFirstBin):
 826                         binID = 0
 827                     elif fixedFirstBin > 0:
 828                         binID = 1
 829
 830                     if binID >= bins:
 831                         binID = bins - 1
 832
 833                     try:
 834                         regionsBins[regionID][binID] += normalizedTag * weight
 835                     except KeyError:
 836                         print "%s %s" % (regionID, str(binID))
 837
 838                     stopPoint = stop
 839
 840     return (regionsBins, regionsLen)