geneStallingBins.py

   1 #
   2 #
   3 #  geneStallingBins.py
   4 #  ENRAGE
   5 #
   6
   7 # originally from geneLocusBins.py
   8 try:
   9     import psyco
  10     psyco.full()
  11 except:
  12     pass
  13
  14 import sys, optparse
  15 from commoncode import readDataset, getMergedRegions, computeRegionBins, getLocusByChromDict
  16 from cistematic.genomes import Genome
  17 from cistematic.core.geneinfo import geneinfoDB
  18
  19 print "%prog: version 1.3"
  20
  21
  22 def main(argv=None):
  23     if not argv:
  24         argv = sys.argv
  25
  26     usage = "usage: python %s genome rdsfile controlrdsfile outfilename [--upstream bp] [--downstream bp] [--regions acceptfile] [--cache] [--normalize] [--tagCount]"
  27
  28     parser = optparse.OptionParser(usage=usage)
  29     parser.add_option("--upstream", type="int", dest="upstreamBp")
  30     parser.add_option("--downstream", type="int", dest="downstreamBp")
  31     parser.add_option("--regions", dest="acceptfile")
  32     parser.add_option("--cache", action="store_true", dest="doCache")
  33     parser.add_option("--normalize", action="store_true", dest="normalize")
  34     parser.add_option("--tagCount", action="store_true", dest="doTagCount")
  35     parser.add_option("--bins", type="int", dest="bins")
  36     parser.set_defaults(upstreamBp=300, downstreamBp=0, acceptfile="",
  37                         doCache=False, normalize=False, doTagCount=False, bins=4)
  38     (options, args) = parser.parse_args(argv[1:])
  39
  40     if len(args) < 4:
  41         print usage
  42         sys.exit(1)
  43
  44     genome = args[0]
  45     hitfile =  args[1]
  46     controlfile = args[2]
  47     outfilename = args[3]
  48
  49     geneStallingBins(genome, hitfile, controlfile, outfilename, options.upstreamBp,
  50                      options.downstreamBp, options.acceptfile, options.doCache,
  51                      options.normalize, options.doTagCount, options.bins)
  52
  53
  54 def geneStallingBins(genome, hitfile, controlfile, outfilename, upstreamBp=300,
  55                      downstreamBp=0, acceptfile="", doCache=False, normalize=False,
  56                      doTagCount=False, bins=4):
  57
  58     acceptDict = {}
  59     if acceptfile:
  60         acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
  61
  62     doCDS = True
  63     limitNeighbor = False
  64
  65     hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
  66     readlen = hitRDS.getReadSize()
  67     hitNormalizationFactor = 1.0
  68     if normalize:
  69         hitDictSize = len(hitRDS)
  70         hitNormalizationFactor = hitDictSize / 1000000.
  71
  72     controlRDS = readDataset(hitfile, verbose=True, cache=doCache)
  73     controlNormalizationFactor = 1.0
  74     if normalize:
  75         controlDictSize = len(hitRDS)
  76         controlNormalizationFactor = controlDictSize / 1000000.
  77
  78     hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
  79     controlDict = controlRDS.getReadsDict(doMulti=True, findallOptimize=True)
  80
  81     hg = Genome(genome)
  82     idb = geneinfoDB(cache=doCache)
  83
  84     geneinfoDict = idb.getallGeneInfo(genome)
  85     locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor=limitNeighbor)
  86
  87     gidList = hg.allGIDs()
  88     gidList.sort()
  89     for chrom in acceptDict:
  90         for (label, start, stop, length) in acceptDict[chrom]:
  91             if label not in gidList:
  92                 gidList.append(label)
  93
  94     (gidBins, gidLen) = computeRegionBins(locusByChromDict, hitDict, bins, readlen, gidList, hitNormalizationFactor, defaultRegionFormat=False, binLength=upstreamBp)
  95     (controlBins, gidLen) = computeRegionBins(locusByChromDict, controlDict, bins, readlen, gidList, controlNormalizationFactor, defaultRegionFormat=False, binLength=upstreamBp)
  96
  97     outfile = open(outfilename, "w")
  98
  99     for gid in gidList:
 100         if "FAR" not in gid:
 101             symbol = "LOC" + gid
 102             geneinfo = ""
 103             try:
 104                 geneinfo = geneinfoDict[gid]
 105                 symbol = geneinfo[0][0]
 106             except:
 107                 pass
 108         else:
 109             symbol = gid
 110
 111         if gid in gidBins and gid in gidLen:
 112             tagCount = 0.
 113             controlCount = 0.
 114             for binAmount in gidBins[gid]:
 115                 tagCount += binAmount
 116
 117             for binAmount in controlBins[gid]:
 118                 controlCount += abs(binAmount)
 119
 120             diffCount = tagCount + controlCount
 121             if diffCount < 0:
 122                 diffCount = 0
 123
 124             outfile.write("%s\t%s\t%.1f\t%d" % (gid, symbol, diffCount, gidLen[gid]))
 125             if (gidLen[gid] - 3 * upstreamBp) < upstreamBp:
 126                 outfile.write("\tshort\n")
 127                 continue
 128
 129             TSSbins = (tagCount * (gidBins[gid][0] + gidBins[gid][1]) + controlCount * (controlBins[gid][0] + controlBins[gid][1])) / (upstreamBp / 50.)
 130             finalbin = (tagCount * gidBins[gid][-1] + controlCount * controlBins[gid][-1]) / ((gidLen[gid] - 3. * upstreamBp) / 100.)
 131             if finalbin <= 0.:
 132                 finalbin = 0.01
 133
 134             if TSSbins < 0:
 135                 TSSbins = 0
 136
 137             ratio =  float(TSSbins)/float(finalbin)
 138             for binAmount in gidBins[gid]:
 139                 if doTagCount:
 140                     binAmount = binAmount * tagCount / 100.
 141
 142                 if normalize:
 143                     if tagCount == 0:
 144                         tagCount = 1
 145
 146                     outfile.write("\t%.1f" % (100. * binAmount / tagCount))
 147                 else:
 148                     outfile.write("\t%.1f" % binAmount)
 149
 150         outfile.write("\t%.2f\n" % ratio)
 151
 152     outfile.close()
 153
 154
 155 if __name__ == "__main__":
 156     main(sys.argv)