commoncode.py

   1 import ConfigParser
   2 import os
   3 import string
   4 from time import strftime
   5 from array import array
   6 from collections import defaultdict
   7 import Peak
   8 from cistematic.core.geneinfo import geneinfoDB
   9 from cistematic.genomes import Genome
  10 import Region
  11
  12 commoncodeVersion = 5.6
  13
  14 #TODO: This is a function dumping ground - needs to be reworked
  15 class ErangeError(Exception):
  16     pass
  17
  18
  19 def getReverseComplement(base):
  20     revComp = {"A": "T",
  21                "T": "A",
  22                "G": "C",
  23                "C": "G",
  24                "N": "N"
  25         }
  26
  27     return revComp[base]
  28
  29
  30 def countDuplicatesInList(listToCheck):
  31     tally = defaultdict(int)
  32     for item in listToCheck:
  33         tally[item] += 1
  34
  35     return tally.items()
  36
  37
  38 def writeLog(logFile, messenger, message):
  39     """ create a log file to write a message from a messenger or append to an existing file.
  40     """
  41     try:
  42         logfile = open(logFile)
  43     except IOError:
  44         logfile = open(logFile, "w")
  45     else:
  46         logfile = open(logFile, "a")
  47
  48     logfile.writelines("%s: [%s] %s\n" % (strftime("%Y-%m-%d %H:%M:%S"), messenger, message))
  49     logfile.close()
  50
  51
  52 def getChromInfoList(chrominfo):
  53     chromInfoList = []
  54     linelist = open(chrominfo)
  55     for line in linelist:
  56         fields = line.strip().split('\t')
  57         chr = fields[0]
  58         start = 0
  59         end = int(fields[1])
  60         chromInfoList.append((chr,start,end))
  61
  62     linelist.close()
  63
  64     return chromInfoList
  65
  66
  67 # BAM file support functions
  68 def isSpliceEntry(cigarTupleList):
  69     isSplice = False
  70     if cigarTupleList is not None:
  71         for operation,length in cigarTupleList:
  72             if operation == 3:
  73                 isSplice = True
  74                 break
  75
  76     return isSplice
  77
  78
  79 def addPairIDtoReadID(alignedread):
  80     ID = alignedread.qname
  81     if alignedread.is_read1:
  82         ID = ID + '/1'
  83
  84     if alignedread.is_read2:
  85         ID = ID + '/2'
  86
  87     return ID
  88
  89
  90 def getHeaderComment(bamHeader, commentKey):
  91     for comment in bamHeader["CO"]:
  92         fields = comment.split("\t")
  93         if fields[0] == commentKey:
  94             return fields[1]
  95
  96     raise KeyError
  97
  98
  99 def getReadSense(read):
 100     if read.is_reverse:
 101         sense = "-"
 102     else:
 103         sense = "+"
 104
 105     return sense
 106
 107 #TODO: is this where we need to take into account the NH > 10 restriction?
 108 def countThisRead(read, countUniqs, countMulti, countSplices, sense):
 109     countRead = True
 110
 111     if sense in ['-', '+'] and sense != getReadSense(read):
 112         countRead = False
 113     elif not countSplices and isSpliceEntry(read.cigar):
 114         countRead = False
 115     elif not countUniqs and read.opt('NH') == 1:
 116         countRead = False
 117     elif not countMulti and read.opt('NH') > 1:
 118         countRead = False
 119
 120     return countRead
 121
 122
 123 # Cistematic functions
 124 def getGeneInfoDict(genome, cache=False):
 125     idb = geneinfoDB(cache=cache)
 126     if genome == "dmelanogaster":
 127         geneinfoDict = idb.getallGeneInfo(genome, infoKey="locus")
 128     else:
 129         geneinfoDict = idb.getallGeneInfo(genome)
 130
 131     return geneinfoDict
 132
 133
 134 def getGeneAnnotDict(genome, inRAM=False):
 135     return getExtendedGeneAnnotDict(genome, "", inRAM=inRAM)
 136
 137
 138 def getExtendedGeneAnnotDict(genomeName, extendGenome, replaceModels=False, inRAM=False):
 139     genome = Genome(genomeName, inRAM=inRAM)
 140     if extendGenome != "":
 141         genome.extendFeatures(extendGenome, replace=replaceModels)
 142
 143     geneannotDict = genome.allAnnotInfo()
 144
 145     return geneannotDict
 146
 147
 148 # Configuration File Support
 149 def getConfigParser(fileList=[]):
 150     configFiles = ["erange.config", os.path.expanduser("~/.erange.config")]
 151     for filename in fileList:
 152         configFiles.append(filename)
 153
 154     config = ConfigParser.SafeConfigParser()
 155     config.read(configFiles)
 156
 157     return config
 158
 159
 160 def getConfigOption(parser, section, option, default=None):
 161     try:
 162         setting = parser.get(section, option)
 163     except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
 164         setting = default
 165
 166     return setting
 167
 168
 169 def getConfigIntOption(parser, section, option, default=None):
 170     try:
 171         setting = parser.getint(section, option)
 172     except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
 173         setting = default
 174
 175     return setting
 176
 177
 178 def getConfigFloatOption(parser, section, option, default=None):
 179     try:
 180         setting = parser.getfloat(section, option)
 181     except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
 182         setting = default
 183
 184     return setting
 185
 186
 187 def getConfigBoolOption(parser, section, option, default=None):
 188     try:
 189         setting = parser.getboolean(section, option)
 190     except (ConfigParser.NoSectionError, ConfigParser.NoOptionError, ValueError):
 191         setting = default
 192
 193     return setting
 194
 195
 196 def getAllConfigSectionOptions(parser, section):
 197     try:
 198         setting = parser.items(section)
 199     except ConfigParser.NoSectionError:
 200         setting = []
 201
 202     return setting
 203
 204
 205 # All the Other Stuff from before
 206 def getMergedRegions(regionfilename, maxDist=1000, minHits=0, verbose=False, keepLabel=False,
 207                      fullChrom=False, chromField=1, scoreField=4, pad=0, compact=False,
 208                      doMerge=True, keepPeak=False, returnTop=0):
 209
 210     """ returns a dictionary containing a list of merged overlapping regions by chromosome;
 211     can optionally filter regions that have a scoreField fewer than minHits.
 212     Can also optionally return the label of each region, as well as the
 213     peak, if supplied (peakPos and peakHeight should be the last 2 fields).
 214     Can return the top regions based on score if higher than minHits.
 215     """
 216     infile = open(regionfilename)
 217     lines = infile.readlines()
 218     regions = getMergedRegionsFromList(lines, maxDist, minHits, verbose, keepLabel,
 219                                        fullChrom, chromField, scoreField, pad, compact,
 220                                        doMerge, keepPeak, returnTop)
 221
 222     infile.close()
 223
 224     return regions
 225
 226
 227 def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False, keepLabel=False,
 228                      fullChrom = False, chromField=1, scoreField=4, pad=0, compact=False,
 229                      doMerge=True, keepPeak=False, returnTop=0):
 230     """ returns a dictionary containing a list of merged overlapping regions by chromosome;
 231     can optionally filter regions that have a scoreField fewer than minHits.
 232     Can also optionally return the label of each region, as well as the
 233     peak, if supplied (peakPos and peakHeight should be the last 2 fields).
 234     Can return the top regions based on score if higher than minHits.
 235     """
 236     regions = {}
 237     hasPvalue = 0
 238     hasShift = 0
 239     if 0 < returnTop < len(regionList):
 240         scores = []
 241         for regionEntry in regionList:
 242             if regionEntry[0] == "#":
 243                 if "pvalue" in regionEntry:
 244                     hasPvalue = 1
 245
 246                 if "readShift" in regionEntry:
 247                     hasShift = 1
 248
 249                 continue
 250
 251             fields = regionEntry.strip().split("\t")
 252             hits = float(fields[scoreField].strip())
 253             scores.append(hits)
 254
 255         scores.sort()
 256         returnTop = -1 * returnTop
 257         minScore = scores[returnTop]
 258         if minScore > minHits:
 259             minHits = minScore
 260
 261     mergeCount = 0
 262     chromField = int(chromField)
 263     count = 0
 264     #TODO: Current algorithm processes input file line by line and compares with prior lines.  Problem is it
 265     #      exits at the first merge.  This is not a problem when the input is sorted by start position, but in
 266     #      the case of 3 regions ABC that are in the input file as ACB as it goes now when processing C there
 267     #      will be no merge with A as B is needed to bridge the two.  When it comes time to process B it will
 268     #      be merged with A but that will exit the loop and the merge with C will be missed.
 269     #TODO: Are we going to require sorted region files to have this even work?
 270     for regionEntry in regionList:
 271         if regionEntry[0] == "#":
 272             if "pvalue" in regionEntry:
 273                 hasPvalue = 1
 274
 275             if "readShift" in regionEntry:
 276                 hasShift = 1
 277
 278             continue
 279
 280         fields = regionEntry.strip().split("\t")
 281         if minHits >= 0:
 282             try:
 283                 hits = float(fields[scoreField].strip())
 284             except (IndexError, ValueError):
 285                 continue
 286
 287             if hits < minHits:
 288                 continue
 289
 290         if compact:
 291             (chrom, pos) = fields[chromField].split(":")
 292             (front, back) = pos.split("-")
 293             start = int(front)
 294             stop = int(back)
 295         elif chromField > 1:
 296             label = string.join(fields[:chromField],"\t")
 297             chrom = fields[chromField]
 298             start = int(fields[chromField + 1]) - pad
 299             stop = int(fields[chromField + 2]) + pad
 300         else:
 301             label = fields[0]
 302             chrom = fields[1]
 303             start = int(fields[2]) - pad
 304             stop = int(fields[3]) + pad
 305
 306         if not fullChrom:
 307             chrom = chrom[3:]
 308
 309         if keepPeak:
 310             peakPos = int(fields[-2 - hasPvalue - hasShift])
 311             peakHeight = float(fields[-1 - hasPvalue - hasShift])
 312
 313         if chrom not in regions:
 314             regions[chrom] = []
 315
 316         merged = False
 317
 318         if doMerge and len(regions[chrom]) > 0:
 319             for index in range(len(regions[chrom])):
 320                 region = regions[chrom][index]
 321                 rstart = region.start
 322                 rstop = region.stop
 323                 if regionsOverlap(start, stop, rstart, rstop) or regionsAreWithinDistance(start, stop, rstart, rstop, maxDist):
 324                     if start < rstart:
 325                         rstart = start
 326
 327                     if rstop < stop:
 328                         rstop = stop
 329
 330                     if keepPeak:
 331                         rpeakPos = region.peakPos
 332                         rpeakHeight = region.peakHeight
 333                         if peakHeight > rpeakHeight:
 334                             rpeakHeight = peakHeight
 335                             rpeakPos = peakPos
 336
 337                     regions[chrom][index].start = rstart
 338                     regions[chrom][index].stop = rstop
 339                     regions[chrom][index].length = abs(rstop - rstart)
 340                     if keepLabel:
 341                         regions[chrom][index].label = label
 342
 343                     if keepPeak:
 344                         regions[chrom][index].peakPos = rpeakPos
 345                         regions[chrom][index].peakHeight = rpeakHeight
 346
 347
 348                     mergeCount += 1
 349                     merged = True
 350                     break
 351
 352         if not merged:
 353             region = Region.Region(start, stop)
 354             if keepLabel:
 355                 region.label = label
 356
 357             if keepPeak:
 358                 region.peakPos = peakPos
 359                 region.peakHeight = peakHeight
 360
 361             regions[chrom].append(region)
 362             count += 1
 363
 364         if verbose and (count % 100000 == 0):
 365             print count
 366
 367     regionCount = 0
 368     for chrom in regions:
 369         regionCount += len(regions[chrom])
 370         regions[chrom].sort(cmp=lambda x,y:cmp(x.start, y.start))
 371
 372     if verbose:
 373         print "merged %d times" % mergeCount
 374         print "returning %d regions" % regionCount
 375
 376     return regions
 377
 378
 379 def regionsOverlap(start, stop, rstart, rstop):
 380     if start > stop:
 381         (start, stop) = (stop, start)
 382
 383     if rstart > rstop:
 384         (rstart, rstop) = (rstop, rstart)
 385
 386     return (rstart <= start <= rstop) or (rstart <= stop <= rstop) or (start <= rstart <= stop) or (start <= rstop <= stop)
 387
 388
 389 def regionsAreWithinDistance(start, stop, rstart, rstop, maxDist):
 390     if start > stop:
 391         (start, stop) = (stop, start)
 392
 393     if rstart > rstop:
 394         (rstart, rstop) = (rstop, rstart)
 395
 396     return (abs(rstart-stop) <= maxDist) or (abs(rstop-start) <= maxDist)
 397
 398
 399 def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False,
 400              shift=0, maxshift=75):
 401     """ find the peak in a list of reads (hitlist) in a region
 402     of a given length and absolute start point. returns a
 403     list of peaks, the number of hits, a triangular-smoothed
 404     version of hitlist, and the number of reads that are
 405     forward (plus) sense.
 406     If doWeight is True, weight the reads accordingly.
 407     If leftPlus is True, return the number of plus reads left of
 408     the peak, taken to be the first TopPos position.
 409     """
 410
 411     if shift == "auto":
 412         shift = getBestShiftForRegion(hitList, start, length, useWeight=doWeight, maxShift=maxshift)
 413
 414     seqArray, regionArray, numHits, numPlus = findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, leftPlus)
 415
 416     # implementing a triangular smooth
 417     smoothArray = array("f", [0.] * length)
 418     for pos in range(2,length -2):
 419         smoothArray[pos] = (seqArray[pos -2] + 2 * seqArray[pos - 1] + 3 * seqArray[pos] + 2 * seqArray[pos + 1] + seqArray[pos + 2]) / 9.0
 420
 421     topPos = getPeakPositionList(smoothArray, length)
 422     peak = Peak.Peak(topPos, numHits, smoothArray, numPlus, shift=shift)
 423
 424     if leftPlus:
 425         numLeftPlus = 0
 426         maxPos = topPos[0]
 427         for read in regionArray:
 428             if doWeight:
 429                 weight = read["weight"]
 430             else:
 431                 weight = 1.0
 432
 433             currentPos = read["start"] - start
 434             if currentPos <= maxPos and read["sense"] == "+":
 435                 numLeftPlus += weight
 436
 437         peak.numLeftPlus = numLeftPlus
 438
 439     return peak
 440
 441
 442 def getBestShiftForRegion(readList, start, length, useWeight=False, maxShift=75):
 443     bestShift = 0
 444     lowestScore = 20000000000
 445     for testShift in xrange(maxShift + 1):
 446         shiftArray = array("f", [0.] * length)
 447         for read in readList:
 448             currentpos = read["start"] - start
 449             if read["sense"] == "+":
 450                 senseModifier = 1
 451             else:
 452                 senseModifier = -1
 453
 454             currentpos += testShift * senseModifier
 455             if (currentpos < 1) or (currentpos >= length):
 456                 continue
 457
 458             if useWeight:
 459                 weight = read["weight"]
 460             else:
 461                 weight = 1.0
 462
 463             shiftArray[currentpos] += weight * senseModifier
 464
 465         currentScore = 0
 466         for score in shiftArray:
 467             currentScore += abs(score)
 468
 469         if currentScore < lowestScore:
 470             bestShift = testShift
 471             lowestScore = currentScore
 472
 473     return bestShift
 474
 475
 476 def findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, leftPlus):
 477     seqArray = array("f", [0.] * length)
 478     numHits = 0.
 479     numPlus = 0.
 480     regionArray = []
 481     for read in hitList:
 482         currentpos = read["start"] - start
 483         if read["sense"] == "+":
 484             currentpos += shift
 485         else:
 486             currentpos -= shift
 487
 488         if (currentpos <  1 - readlen) or (currentpos >= length):
 489             continue
 490
 491         if doWeight:
 492             weight = read["weight"]
 493         else:
 494             weight = 1.0
 495
 496         numHits += weight
 497         if leftPlus:
 498             regionArray.append(read)
 499
 500         hitIndex = 0
 501         while currentpos < 0:
 502             hitIndex += 1
 503             currentpos += 1
 504
 505         while hitIndex < readlen and currentpos < length:
 506             seqArray[currentpos] += weight
 507             hitIndex += 1
 508             currentpos += 1
 509
 510         if read["sense"] == "+":
 511             numPlus += weight
 512
 513     return seqArray, regionArray, numHits, numPlus
 514
 515
 516 def getPeakPositionList(smoothArray, length):
 517     topNucleotide = 0
 518     peakList = []
 519     for currentpos in xrange(length):
 520         if topNucleotide < smoothArray[currentpos]:
 521             topNucleotide = smoothArray[currentpos]
 522             peakList = [currentpos]
 523         elif topNucleotide == smoothArray[currentpos]:
 524             peakList.append(currentpos)
 525
 526     return peakList
 527
 528
 529 def getFeaturesByChromDict(genomeObject, additionalRegionsDict={}, ignorePseudo=False,
 530                            restrictList=[], regionComplement=False, maxStop=250000000):
 531     """ return a dictionary of cistematic gene features. Requires
 532     cistematic, obviously. Can filter-out pseudogenes. Will use
 533     additional regions dict to supplement gene models, if available.
 534     Can restrict output to a list of GIDs.
 535     If regionComplement is set to true, returns the regions *outside* of the
 536     calculated boundaries, which is useful for retrieving intronic and
 537     intergenic regions. maxStop is simply used to define the uppermost
 538     boundary of the complement region.
 539     """
 540     featuresDict = genomeObject.getallGeneFeatures()
 541     restrictGID = False
 542     if len(restrictList) > 0:
 543         restrictGID = True
 544
 545     if len(additionalRegionsDict) > 0:
 546         sortList = []
 547         for chrom in additionalRegionsDict:
 548             for region in additionalRegionsDict[chrom]:
 549                 label = region.label
 550                 if label not in sortList:
 551                     sortList.append(label)
 552
 553                 if label not in featuresDict:
 554                     featuresDict[label] = []
 555                     sense = "+"
 556                 else:
 557                     sense = featuresDict[label][0][-1]
 558
 559                 featuresDict[label].append(("custom", chrom, region.start, region.stop, sense))
 560
 561         for gid in sortList:
 562             featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
 563
 564     featuresByChromDict = {}
 565     for gid in featuresDict:
 566         if restrictGID and gid not in restrictList:
 567             continue
 568
 569         newFeatureList, chrom, sense = getNewFeatureList(featuresDict[gid])
 570         if newFeatureList and chrom not in featuresByChromDict:
 571             featuresByChromDict[chrom] = []
 572
 573         for (start, stop, ftype) in newFeatureList:
 574             featuresByChromDict[chrom].append((start, stop, gid, sense, ftype))
 575
 576     for chrom in featuresByChromDict:
 577         featuresByChromDict[chrom].sort()
 578
 579     if regionComplement:
 580         complementByChromDict = {}
 581         complementIndex = 0
 582         for chrom in featuresByChromDict:
 583             complementByChromDict[chrom] = []
 584             listLength = len(featuresByChromDict[chrom])
 585             if listLength > 0:
 586                 currentStart = 0
 587                 for index in range(listLength):
 588                     currentStop = featuresByChromDict[chrom][index][0]
 589                     complementIndex += 1
 590                     if currentStart < currentStop:
 591                         complementByChromDict[chrom].append((currentStart, currentStop, "nonExon%d" % complementIndex, "F", "nonExon"))
 592
 593                     currentStart = featuresByChromDict[chrom][index][1]
 594
 595                 currentStop = maxStop
 596                 complementByChromDict[chrom].append((currentStart, currentStop, "nonExon%d" % complementIndex, "F", "nonExon"))
 597
 598         return (featuresByChromDict, complementByChromDict)
 599     else:
 600         return featuresByChromDict
 601
 602
 603 def getNewFeatureList(featureList, ignorePseudo=False):
 604     newFeatureList = []
 605     for (featureType, chrom, start, stop, sense) in featureList:
 606         if featureType == "PSEUDO" and ignorePseudo:
 607             return [], chrom, sense
 608
 609         if (start, stop, featureType) not in newFeatureList:
 610             notContained = True
 611             containedList = []
 612             for (newFeatureStart, newFeatureStop, featureType2) in newFeatureList:
 613                 if start >= newFeatureStart and stop <= newFeatureStop:
 614                     notContained = False
 615
 616                 if start < newFeatureStart and stop > newFeatureStop:
 617                     containedList.append((newFeatureStart, newFeatureStop))
 618
 619             if len(containedList) > 0:
 620                 newFList = []
 621                 notContained = True
 622                 for (newFeatureStart, newFeatureStop, featureType2) in newFeatureList:
 623                     if (newFeatureStart, newFeatureStop) not in containedList:
 624                         newFList.append((newFeatureStart, newFeatureStop, featureType2))
 625                         if start >= newFeatureStart and stop <= newFeatureStop:
 626                             notContained = False
 627
 628                 newFeatureList = newFList
 629
 630             if notContained:
 631                 newFeatureList.append((start, stop, featureType))
 632
 633     return newFeatureList, chrom, sense
 634
 635
 636 def getLocusByChromDict(genome, upstream=0, downstream=0, useCDS=True,
 637                         additionalRegionsDict={}, ignorePseudo=False, upstreamSpanTSS=False,
 638                         lengthCDS=0, keepSense=False, adjustToNeighbor=True):
 639     """ return a dictionary of gene loci. Can be used to retrieve additional
 640     sequence upstream or downstream of gene, up to the next gene. Requires
 641     cistematic, obviously.
 642     Can filter-out pseudogenes and use additional regions outside of existing
 643     gene models. Use upstreamSpanTSS to overlap half of the upstream region
 644     over the TSS.
 645     If lengthCDS > 0 bp, e.g. X, return only the starting X bp from CDS. If
 646     lengthCDS < 0bp, return only the last X bp from CDS.
 647     """
 648     locusByChromDict = {}
 649     if usingBadLocusParameters(upstream, downstream, useCDS, lengthCDS, upstreamSpanTSS):
 650         return locusByChromDict
 651
 652     genomeName = genome.genome
 653     featuresDict = getGeneFeatures(genome, additionalRegionsDict)
 654     for gid in featuresDict:
 655         featureList = featuresDict[gid]
 656         newFeatureList = []
 657         for (ftype, chrom, start, stop, sense) in featureList:
 658             newFeatureList.append((start, stop))
 659
 660         if ignorePseudo and ftype == "PSEUDO":
 661             continue
 662
 663         newFeatureList.sort()
 664
 665         sense = featureList[0][-1]
 666         gstart = newFeatureList[0][0]
 667         gstop = newFeatureList[-1][1]
 668         glen = abs(gstart - gstop)
 669         if sense == "F":
 670             if not useCDS and upstream > 0:
 671                 if upstreamSpanTSS:
 672                     if gstop > (gstart + upstream / 2):
 673                         gstop = gstart + upstream / 2
 674                 else:
 675                     gstop = gstart
 676             elif not useCDS and downstream > 0:
 677                 gstart = gstop
 678
 679             if upstream > 0:
 680                 if upstreamSpanTSS:
 681                     distance = upstream / 2
 682                 else:
 683                     distance = upstream
 684
 685                 if adjustToNeighbor:
 686                     nextGene = genome.leftGeneDistance((genomeName, gid), distance * 2)
 687                     if nextGene < distance * 2:
 688                         distance = nextGene / 2
 689
 690                 distance = max(distance, 1)
 691                 gstart -= distance
 692
 693             if downstream > 0:
 694                 distance = downstream
 695                 if adjustToNeighbor:
 696                     nextGene = genome.rightGeneDistance((genomeName, gid), downstream * 2)
 697                     if nextGene < downstream * 2:
 698                         distance = nextGene / 2
 699
 700                 distance = max(distance, 1)
 701                 gstop += distance
 702
 703             if 0 < lengthCDS < glen:
 704                 gstop = newFeatureList[0][0] + lengthCDS
 705
 706             if lengthCDS < 0 and abs(lengthCDS) < glen:
 707                 gstart = newFeatureList[-1][1] + lengthCDS
 708         else:
 709             if not useCDS and upstream > 0:
 710                 if upstreamSpanTSS:
 711                     if gstart < (gstop - upstream / 2):
 712                         gstart = gstop - upstream / 2
 713                 else:
 714                     gstart = gstop
 715             elif not useCDS and downstream > 0:
 716                     gstop = gstart
 717
 718             if upstream > 0:
 719                 if upstreamSpanTSS:
 720                     distance = upstream /2
 721                 else:
 722                     distance = upstream
 723
 724                 if adjustToNeighbor:
 725                     nextGene = genome.rightGeneDistance((genomeName, gid), distance * 2)
 726                     if nextGene < distance * 2:
 727                         distance = nextGene / 2
 728
 729                 distance = max(distance, 1)
 730                 gstop += distance
 731
 732             if downstream > 0:
 733                 distance = downstream
 734                 if adjustToNeighbor:
 735                     nextGene = genome.leftGeneDistance((genomeName, gid), downstream * 2)
 736                     if nextGene < downstream * 2:
 737                         distance = nextGene / 2
 738
 739                 distance = max(distance, 1)
 740                 gstart -= distance
 741
 742             if 0 < lengthCDS < glen:
 743                 gstart = newFeatureList[-1][-1] - lengthCDS
 744
 745             if lengthCDS < 0 and abs(lengthCDS) < glen:
 746                 gstop = newFeatureList[0][0] - lengthCDS
 747
 748         if chrom not in locusByChromDict:
 749             locusByChromDict[chrom] = []
 750
 751         if keepSense:
 752             locusByChromDict[chrom].append((gstart, gstop, gid, glen, sense))
 753         else:
 754             locusByChromDict[chrom].append((gstart, gstop, gid, glen))
 755
 756     for chrom in locusByChromDict:
 757         locusByChromDict[chrom].sort()
 758
 759     return locusByChromDict
 760
 761
 762 def usingBadLocusParameters(upstream, downstream, useCDS, lengthCDS, upstreamSpanTSS):
 763     if upstream == 0 and downstream == 0 and not useCDS:
 764         print "getLocusByChromDict: asked for no sequence - returning empty dict"
 765         return True
 766     elif upstream > 0 and downstream > 0 and not useCDS:
 767         print "getLocusByChromDict: asked for only upstream and downstream - returning empty dict"
 768         return True
 769     elif lengthCDS != 0 and not useCDS:
 770         print "getLocusByChromDict: asked for partial CDS but not useCDS - returning empty dict"
 771         return True
 772     elif upstreamSpanTSS and lengthCDS != 0:
 773         print "getLocusByChromDict: asked for TSS spanning and partial CDS - returning empty dict"
 774         return True
 775     elif lengthCDS > 0 and downstream > 0:
 776         print "getLocusByChromDict: asked for discontinuous partial CDS from start and downstream - returning empty dict"
 777         return True
 778     elif lengthCDS < 0 and upstream > 0:
 779         print "getLocusByChromDict: asked for discontinuous partial CDS from stop and upstream - returning empty dict"
 780         return True
 781
 782     return False
 783
 784
 785 def getGeneFeatures(genome, additionalRegionsDict):
 786     featuresDict = genome.getallGeneFeatures()
 787     if len(additionalRegionsDict) > 0:
 788         sortList = []
 789         for chrom in additionalRegionsDict:
 790             for region in additionalRegionsDict[chrom]:
 791                 label = region.label
 792                 if label not in sortList:
 793                     sortList.append(label)
 794
 795                 if label not in featuresDict:
 796                     featuresDict[label] = []
 797                     sense = "+"
 798                 else:
 799                     sense = featuresDict[label][0][-1]
 800
 801                 featuresDict[label].append(("custom", chrom, region.start, region.stop, sense))
 802
 803         for gid in sortList:
 804             featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2]))
 805
 806     return featuresDict
 807
 808
 809 def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[],
 810                       normalizedTag=1., defaultRegionFormat=True, fixedFirstBin=-1,
 811                       binLength=-1):
 812     """ returns 2 dictionaries of bin counts and region lengths, given a dictionary of predefined regions,
 813         a dictionary of reads, a number of bins, the length of reads, and optionally a list of regions
 814         or a different weight / tag.
 815     """
 816     index = 0
 817     regionsBins = {}
 818     regionsLen = {}
 819
 820     if defaultRegionFormat:
 821         regionIDField = 0
 822         startField = 1
 823         stopField = 2
 824         lengthField = 3
 825     else:
 826         startField = 0
 827         stopField = 1
 828         regionIDField = 2
 829         lengthField = 3
 830
 831     senseField = 4
 832
 833     print "entering computeRegionBins"
 834     if len(regionList) > 0:
 835         for readID in regionList:
 836             regionsBins[readID] = [0.] * bins
 837     else:
 838         for chrom in regionsByChromDict:
 839             for regionTuple in regionsByChromDict[chrom]:
 840                 regionID = regionTuple[regionIDField]
 841                 regionsBins[regionID] = [0.] * bins
 842
 843     for chrom in hitDict:
 844         if chrom not in regionsByChromDict:
 845             continue
 846
 847         for regionTuple in regionsByChromDict[chrom]:
 848             regionID = regionTuple[regionIDField]
 849             regionsLen[regionID] = regionTuple[lengthField]
 850
 851         print "%s\n" % chrom
 852         startRegion = 0
 853         for read in hitDict[chrom]:
 854             tagStart = read["start"]
 855             weight = read["weight"]
 856             index += 1
 857             if index % 100000 == 0:
 858                 print "read %d " % index,
 859
 860             stopPoint = tagStart + readlen
 861             if startRegion < 0:
 862                 startRegion = 0
 863
 864             for regionTuple in regionsByChromDict[chrom][startRegion:]:
 865                 start = regionTuple[startField]
 866                 stop = regionTuple[stopField]
 867                 regionID = regionTuple[regionIDField]
 868                 rlen = regionTuple[lengthField]
 869                 try:
 870                     rsense = regionTuple[senseField]
 871                 except IndexError:
 872                     rsense = "F"
 873
 874                 if tagStart > stop:
 875                     startRegion += 1
 876                     continue
 877
 878                 if start > stopPoint:
 879                     startRegion -= 10
 880                     break
 881
 882                 if start <= tagStart <= stop:
 883                     if binLength < 1:
 884                         regionBinLength = rlen / bins
 885                     else:
 886                         regionBinLength = binLength
 887
 888                     if rsense == "F":
 889                         positionInRegion = tagStart - start
 890                     else:
 891                         positionInRegion = start + rlen - tagStart
 892
 893                     # we are relying on python's integer division quirk
 894                     binID = positionInRegion / regionBinLength
 895                     if (fixedFirstBin > 0) and (positionInRegion < fixedFirstBin):
 896                         binID = 0
 897                     elif fixedFirstBin > 0:
 898                         binID = 1
 899
 900                     if binID >= bins:
 901                         binID = bins - 1
 902
 903                     try:
 904                         regionsBins[regionID][binID] += normalizedTag * weight
 905                     except KeyError:
 906                         print "%s %s" % (regionID, str(binID))
 907
 908                     stopPoint = stop
 909
 910     return (regionsBins, regionsLen)