geneUpstreamBins.py

   1 #
   2 #  geneUpstreamBins.py
   3 #  ENRAGE
   4 #
   5 # originally from version 1.3 of geneDownstreamBins.py
   6 try:
   7     import psyco
   8     psyco.full()
   9 except:
  10     pass
  11
  12 import sys
  13 import optparse
  14 import ReadDataset
  15 from cistematic.genomes import Genome
  16 from commoncode import getGeneInfoDict, getConfigParser, getConfigBoolOption, getConfigIntOption
  17
  18 print "geneUpstreamBins: version 2.1"
  19
  20 def main(argv=None):
  21     if not argv:
  22         argv = sys.argv
  23
  24     usage = "usage: python %prog genome rdsfile outfilename [--max regionSize] [--raw] [--cache]"
  25
  26     parser = makeParser(usage)
  27     (options, args) = parser.parse_args(argv[1:])
  28
  29     if len(args) < 3:
  30         print usage
  31         sys.exit(1)
  32
  33     genome = args[0]
  34     hitfile =  args[1]
  35     outfilename = args[3]
  36
  37     geneUpstreamBins(genome, hitfile, outfilename, options.standardMinDist, options.normalize, options.doCache)
  38
  39
  40 def makeParser(usage=""):
  41     parser = optparse.OptionParser(usage=usage)
  42     parser.add_option("--raw", action="store_false", dest="normalize",
  43                        help="maximum region in bp")
  44     parser.add_option("--max", type="int", dest="standardMinDist")
  45     parser.add_option("--cache", action="store_true", dest="doCache")
  46
  47     configParser = getConfigParser()
  48     section = "geneUpstreamBins"
  49     standardMinDist = getConfigIntOption(configParser, section, "regionSize", 3000)
  50     normalize = getConfigBoolOption(configParser, section, "normalize", True)
  51     doCache = getConfigBoolOption(configParser, section, "doCache", False)
  52
  53     parser.set_defaults(standardMinDist=standardMinDist, normalize=normalize, doCache=doCache)
  54
  55     return parser
  56
  57
  58 def geneUpstreamBins(genome, hitfile, outfilename, standardMinDist=3000, normalize=True, doCache=False):
  59     bins = 10
  60     standardMinThresh = standardMinDist / bins
  61
  62     hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache)
  63     normalizationFactor = 1.0
  64     if normalize:
  65         totalCount = len(hitRDS)
  66         normalizationFactor = totalCount / 1000000.
  67
  68     hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
  69
  70     hg = Genome(genome)
  71     geneinfoDict = getGeneInfoDict(genome, cache=True)
  72     featuresDict = hg.getallGeneFeatures()
  73
  74     outfile = open(outfilename,"w")
  75
  76     gidList = hg.allGIDs()
  77     gidList.sort()
  78     for gid in gidList:
  79         symbol = "LOC" + gid
  80         geneinfo = ""
  81         featureList = []
  82         try:
  83             geneinfo = geneinfoDict[gid]
  84             featureList = featuresDict[gid]
  85             symbol = geneinfo[0][0]
  86         except:
  87             print geneinfo
  88
  89         newfeatureList = []
  90         if len(featureList) == 0:
  91             continue
  92
  93         for (ftype, chrom, start, stop, fsense) in featureList:
  94             if (start, stop) not in newfeatureList:
  95                 newfeatureList.append((start, stop))
  96
  97         if chrom not in hitDict:
  98             continue
  99
 100         newfeatureList.sort()
 101         if len(newfeatureList) < 1:
 102             continue
 103
 104         glen = standardMinDist
 105         if fsense == "F":
 106             nextGene = hg.leftGeneDistance((genome, gid), glen * 2)
 107             if nextGene < glen * 2:
 108                 glen = nextGene / 2
 109
 110             if glen < 1:
 111                 glen = 1
 112
 113             gstart = newfeatureList[0][0] - glen
 114             if gstart < 0:
 115                 gstart = 0
 116
 117         else:
 118             nextGene = hg.rightGeneDistance((genome, gid), glen * 2)
 119             if nextGene < glen * 2:
 120                 glen = nextGene / 2
 121
 122             if glen < 1:
 123                 glen = 1
 124
 125             gstart = newfeatureList[-1][1]
 126
 127         tagCount = 0
 128         if glen < standardMinDist:
 129             continue
 130
 131         binList = [0] * bins
 132         for read in hitDict[chrom]:
 133             tagStart = read["start"]
 134             weight = read["weight"]
 135             tagStart -= gstart
 136             if tagStart >= glen:
 137                 break
 138
 139             if tagStart > 0:
 140                 tagCount += weight
 141                 if fsense == "R":
 142                     # we are relying on python's integer division quirk
 143                     binID = tagStart / standardMinThresh
 144                     binList[binID] += weight
 145                 else:
 146                     rdist = glen - tagStart
 147                     binID = rdist / standardMinThresh
 148                     binList[binID] += weight
 149
 150         if tagCount < 2:
 151             continue
 152
 153         print "%s %s %d %d %s" % (gid, symbol, normalizationFactor * tagCount, glen, str(binList))
 154         outfile.write("%s\t%s\t%d\t%d" % (gid, symbol, normalizationFactor * tagCount, glen))
 155         for binAmount in binList:
 156             outfile.write("\t%d" % binAmount)
 157         outfile.write("\n")
 158
 159     outfile.close()
 160
 161
 162 if __name__ == "__main__":
 163     main(sys.argv)