geneUpstreamBins.py

   1 #
   2 #  geneUpstreamBins.py
   3 #  ENRAGE
   4 #
   5 # originally from version 1.3 of geneDownstreamBins.py
   6 try:
   7     import psyco
   8     psyco.full()
   9 except:
  10     pass
  11
  12 import sys, optparse
  13 from commoncode import readDataset
  14 from cistematic.genomes import Genome
  15 from cistematic.core.geneinfo import geneinfoDB
  16
  17 print "%prog: version 2.0"
  18
  19 def main(argv=None):
  20     if not argv:
  21         argv = sys.argv
  22
  23     usage = "usage: python %prog genome rdsfile outfilename [--max regionSize] [--raw] [--cache]"
  24
  25     parser = optparse.OptionParser(usage=usage)
  26     parser.add_option("--raw", action="store_false", dest="normalize",
  27                        help="maximum region in bp")
  28     parser.add_option("--max", type="int", dest="standardMinDist")
  29     parser.add_option("--cache", action="store_true", dest="doCache")
  30     parser.set_defaults(standardMinDist=3000, normalize=True, doCache=False)
  31     (options, args) = parser.parse_args(argv[1:])
  32
  33     if len(args) < 3:
  34         print usage
  35         sys.exit(1)
  36
  37     genome = args[0]
  38     hitfile =  args[1]
  39     outfilename = args[3]
  40
  41     geneUpstreamBins(genome, hitfile, outfilename, options.standardMinDist, options.normalize, options.doCache)
  42
  43
  44 def geneUpstreamBins(genome, hitfile, outfilename, standardMinDist=3000, normalize=True, doCache=False):
  45     bins = 10
  46     standardMinThresh = standardMinDist / bins
  47
  48     hitRDS = readDataset(hitfile, verbose = True, cache=doCache)
  49     normalizationFactor = 1.0
  50     if normalize:
  51         totalCount = len(hitRDS)
  52         normalizationFactor = totalCount / 1000000.
  53
  54     hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
  55
  56     hg = Genome(genome)
  57     idb = geneinfoDB(cache=True)
  58
  59     geneinfoDict = idb.getallGeneInfo(genome)
  60     featuresDict = hg.getallGeneFeatures()
  61
  62     outfile = open(outfilename,"w")
  63
  64     gidList = hg.allGIDs()
  65     gidList.sort()
  66     for gid in gidList:
  67         symbol = "LOC" + gid
  68         geneinfo = ""
  69         featureList = []
  70         try:
  71             geneinfo = geneinfoDict[gid]
  72             featureList = featuresDict[gid]
  73             symbol = geneinfo[0][0]
  74         except:
  75             print geneinfo
  76
  77         newfeatureList = []
  78         if len(featureList) == 0:
  79             continue
  80
  81         for (ftype, chrom, start, stop, fsense) in featureList:
  82             if (start, stop) not in newfeatureList:
  83                 newfeatureList.append((start, stop))
  84
  85         if chrom not in hitDict:
  86             continue
  87
  88         newfeatureList.sort()
  89         if len(newfeatureList) < 1:
  90             continue
  91
  92         glen = standardMinDist
  93         if fsense == "F":
  94             nextGene = hg.leftGeneDistance((genome, gid), glen * 2)
  95             if nextGene < glen * 2:
  96                 glen = nextGene / 2
  97
  98             if glen < 1:
  99                 glen = 1
 100
 101             gstart = newfeatureList[0][0] - glen
 102             if gstart < 0:
 103                 gstart = 0
 104
 105         else:
 106             nextGene = hg.rightGeneDistance((genome, gid), glen * 2)
 107             if nextGene < glen * 2:
 108                 glen = nextGene / 2
 109
 110             if glen < 1:
 111                 glen = 1
 112
 113             gstart = newfeatureList[-1][1]
 114
 115         tagCount = 0
 116         if glen < standardMinDist:
 117             continue
 118
 119         binList = [0] * bins
 120         for (tagStart, sense, weight) in hitDict[chrom]:
 121             tagStart -= gstart
 122             if tagStart >= glen:
 123                 break
 124
 125             if tagStart > 0:
 126                 tagCount += weight
 127                 if fsense == "R":
 128                     # we are relying on python's integer division quirk
 129                     binID = tagStart / standardMinThresh
 130                     binList[binID] += weight
 131                 else:
 132                     rdist = glen - tagStart
 133                     binID = rdist / standardMinThresh
 134                     binList[binID] += weight
 135
 136         if tagCount < 2:
 137             continue
 138
 139         print "%s %s %d %d %s" % (gid, symbol, normalizationFactor * tagCount, glen, str(binList))
 140         outfile.write("%s\t%s\t%d\t%d" % (gid, symbol, normalizationFactor * tagCount, glen))
 141         for binAmount in binList:
 142             outfile.write("\t%d" % binAmount)
 143         outfile.write("\n")
 144
 145     outfile.close()
 146
 147
 148 if __name__ == "__main__":
 149     main(sys.argv)