geneDownstreamBins.py

   1 #
   2 #  geneDownstreamBins.py
   3 #  ENRAGE
   4 #
   5
   6 try:
   7     import psyco
   8     psyco.full()
   9 except:
  10     pass
  11
  12 # originally from version 1.3 of geneDnaDownstreamCounts.py
  13 import sys
  14 import optparse
  15 import ReadDataset
  16 from cistematic.genomes import Genome
  17 from commoncode import getGeneInfoDict, getConfigParser, getConfigIntOption
  18
  19 print "geneDownstreamBins: version 2.1"
  20
  21 def main(argv=None):
  22     if not argv:
  23         argv = sys.argv
  24
  25     usage = "usage: %prog genome rdsfile outfilename [--max regionSize]"
  26
  27     parser = makeParser(usage)
  28     (options, args) = parser.parse_args(argv[1:])
  29
  30     if len(args) < 3:
  31         print usage
  32         sys.exit(1)
  33
  34     genome = args[0]
  35     hitfile =  args[1]
  36     outfilename = args[2]
  37
  38     geneDownstreamBins(genome, hitfile, outfilename, options.standardMinDist)
  39
  40
  41 def makeParser(usage=""):
  42     parser = optparse.OptionParser(usage=usage)
  43     parser.add_option("--max", type="int", dest="standardMinDist",
  44                       help="maximum region in bp")
  45
  46     configParser = getConfigParser()
  47     section = "geneDownstreamBins"
  48     standardMinDist = getConfigIntOption(configParser, section, "regionSize", 3000)
  49
  50     parser.set_defaults(standardMinDist=standardMinDist)
  51
  52     return parser
  53
  54
  55 def geneDownstreamBins(genome, hitfile, outfilename, standardMinDist=3000, doCache=False, normalize=False):
  56     bins = 10
  57     standardMinThresh = standardMinDist / bins
  58
  59     hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
  60     normalizationFactor = 1.0
  61     if normalize:
  62         hitDictSize = len(hitRDS)
  63         normalizationFactor = hitDictSize / 1000000.
  64
  65     hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
  66     geneinfoDict = getGeneInfoDict(genome, cache=True)
  67     hg = Genome(genome)
  68     featuresDict = hg.getallGeneFeatures()
  69
  70     outfile = open(outfilename, "w")
  71
  72     gidList = hg.allGIDs()
  73     gidList.sort()
  74     for gid in gidList:
  75         symbol = "LOC" + gid
  76         geneinfo = ""
  77         featureList = []
  78         try:
  79             geneinfo = geneinfoDict[gid]
  80             featureList = featuresDict[gid]
  81             symbol = geneinfo[0][0]
  82         except:
  83             print gid
  84
  85         if len(featureList) == 0:
  86             continue
  87
  88         newfeatureList = []
  89         for (ftype, chrom, start, stop, fsense) in featureList:
  90             if (start, stop) not in newfeatureList:
  91                 newfeatureList.append((start, stop))
  92
  93         if chrom not in hitDict:
  94             continue
  95
  96         newfeatureList.sort()
  97         if len(newfeatureList) < 1:
  98             continue
  99
 100         glen = standardMinDist
 101         if fsense == "F":
 102             nextGene = hg.rightGeneDistance((genome, gid), glen * 2)
 103             if nextGene < glen * 2:
 104                 glen = nextGene / 2
 105
 106             if glen < 1:
 107                 glen = 1
 108
 109             gstart = newfeatureList[-1][1]
 110         else:
 111             nextGene = hg.leftGeneDistance((genome, gid), glen * 2)
 112             if nextGene < glen * 2:
 113                 glen = nextGene / 2
 114
 115             if glen < 1:
 116                 glen = 1
 117
 118             gstart = newfeatureList[0][0] - glen
 119             if gstart < 0:
 120                 gstart = 0
 121
 122         tagCount = 0
 123         if glen < standardMinDist:
 124             continue
 125
 126         binList = [0.] * bins
 127         for read in hitDict[chrom]:
 128             tagStart = read["start"]
 129             weight = read["weight"]
 130             tagStart -= gstart
 131             if tagStart >= glen:
 132                 break
 133
 134             if tagStart > 0:
 135                 tagCount += weight
 136                 if fsense == "F":
 137                     # we are relying on python's integer division quirk
 138                     binID = tagStart / standardMinThresh
 139                     binList[binID] += weight
 140                 else:
 141                     rdist = glen - tagStart
 142                     binID = rdist / standardMinThresh
 143                     binList[binID] += weight
 144
 145         if tagCount < 2:
 146             continue
 147
 148         tagCount *= normalizationFactor
 149         print "%s %s %.2f %d %s" % (gid, symbol, tagCount, glen, str(binList))
 150         outfile.write("%s\t%s\t%.2f\t%d" % (gid, symbol, tagCount, glen))
 151         for binAmount in binList:
 152             outfile.write("\t%.2f" % binAmount)
 153
 154         outfile.write("\n")
 155
 156     outfile.close()
 157
 158
 159 if __name__ == "__main__":
 160     main(sys.argv)