snapshot of 4.0a development. initial git repo commit
[erange.git] / geneStallingBins.py
1 #
2 #
3 #  geneStallingBins.py
4 #  ENRAGE
5 #
6
7 # originally from geneLocusBins.py
8 try:
9     import psyco
10     psyco.full()
11 except:
12     pass
13
14 import sys, optparse
15 from commoncode import readDataset, getMergedRegions, computeRegionBins, getLocusByChromDict
16 from cistematic.genomes import Genome
17 from cistematic.core.geneinfo import geneinfoDB
18
19 print "%prog: version 1.3"
20
21
22 def main(argv=None):
23     if not argv:
24         argv = sys.argv
25
26     usage = "usage: python %s genome rdsfile controlrdsfile outfilename [--upstream bp] [--downstream bp] [--regions acceptfile] [--cache] [--normalize] [--tagCount]"
27
28     parser = optparse.OptionParser(usage=usage)
29     parser.add_option("--upstream", type="int", dest="upstreamBp")
30     parser.add_option("--downstream", type="int", dest="downstreamBp")
31     parser.add_option("--regions", dest="acceptfile")
32     parser.add_option("--cache", action="store_true", dest="doCache")
33     parser.add_option("--normalize", action="store_true", dest="normalize")
34     parser.add_option("--tagCount", action="store_true", dest="doTagCount")
35     parser.add_option("--bins", type="int", dest="bins")
36     parser.set_defaults(upstreamBp=300, downstreamBp=0, acceptfile="",
37                         doCache=False, normalize=False, doTagCount=False, bins=4)
38     (options, args) = parser.parse_args(argv[1:])
39
40     if len(args) < 4:
41         print usage
42         sys.exit(1)
43
44     genome = args[0]
45     hitfile =  args[1]
46     controlfile = args[2]
47     outfilename = args[3]
48
49     geneStallingBins(genome, hitfile, controlfile, outfilename, options.upstreamBp,
50                      options.downstreamBp, options.acceptfile, options.doCache,
51                      options.normalize, options.doTagCount, options.bins)
52
53
54 def geneStallingBins(genome, hitfile, controlfile, outfilename, upstreamBp=300,
55                      downstreamBp=0, acceptfile="", doCache=False, normalize=False,
56                      doTagCount=False, bins=4):
57
58     acceptDict = {}
59     if acceptfile:
60         acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
61
62     doCDS = True
63     limitNeighbor = False
64
65     hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
66     readlen = hitRDS.getReadSize()
67     hitNormalizationFactor = 1.0
68     if normalize:
69         hitDictSize = len(hitRDS)
70         hitNormalizationFactor = hitDictSize / 1000000.
71
72     controlRDS = readDataset(hitfile, verbose=True, cache=doCache)
73     controlNormalizationFactor = 1.0
74     if normalize:
75         controlDictSize = len(hitRDS)
76         controlNormalizationFactor = controlDictSize / 1000000.
77
78     hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
79     controlDict = controlRDS.getReadsDict(doMulti=True, findallOptimize=True)
80
81     hg = Genome(genome)
82     idb = geneinfoDB(cache=doCache)
83
84     geneinfoDict = idb.getallGeneInfo(genome)
85     locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor=limitNeighbor)
86
87     gidList = hg.allGIDs()
88     gidList.sort()
89     for chrom in acceptDict:
90         for (label, start, stop, length) in acceptDict[chrom]:
91             if label not in gidList:
92                 gidList.append(label)
93
94     (gidBins, gidLen) = computeRegionBins(locusByChromDict, hitDict, bins, readlen, gidList, hitNormalizationFactor, defaultRegionFormat=False, binLength=upstreamBp)
95     (controlBins, gidLen) = computeRegionBins(locusByChromDict, controlDict, bins, readlen, gidList, controlNormalizationFactor, defaultRegionFormat=False, binLength=upstreamBp)
96
97     outfile = open(outfilename, "w")
98
99     for gid in gidList:
100         if "FAR" not in gid:
101             symbol = "LOC" + gid
102             geneinfo = ""
103             try:
104                 geneinfo = geneinfoDict[gid]
105                 symbol = geneinfo[0][0]
106             except:
107                 pass
108         else:
109             symbol = gid
110
111         if gid in gidBins and gid in gidLen:
112             tagCount = 0.
113             controlCount = 0.
114             for binAmount in gidBins[gid]:
115                 tagCount += binAmount
116
117             for binAmount in controlBins[gid]:
118                 controlCount += abs(binAmount)
119
120             diffCount = tagCount + controlCount
121             if diffCount < 0:
122                 diffCount = 0
123
124             outfile.write("%s\t%s\t%.1f\t%d" % (gid, symbol, diffCount, gidLen[gid]))
125             if (gidLen[gid] - 3 * upstreamBp) < upstreamBp:
126                 outfile.write("\tshort\n")
127                 continue
128
129             TSSbins = (tagCount * (gidBins[gid][0] + gidBins[gid][1]) + controlCount * (controlBins[gid][0] + controlBins[gid][1])) / (upstreamBp / 50.)
130             finalbin = (tagCount * gidBins[gid][-1] + controlCount * controlBins[gid][-1]) / ((gidLen[gid] - 3. * upstreamBp) / 100.)
131             if finalbin <= 0.:
132                 finalbin = 0.01
133
134             if TSSbins < 0:
135                 TSSbins = 0
136
137             ratio =  float(TSSbins)/float(finalbin)
138             for binAmount in gidBins[gid]:
139                 if doTagCount:
140                     binAmount = binAmount * tagCount / 100.
141
142                 if normalize:
143                     if tagCount == 0:
144                         tagCount = 1
145
146                     outfile.write("\t%.1f" % (100. * binAmount / tagCount))
147                 else:
148                     outfile.write("\t%.1f" % binAmount)
149
150         outfile.write("\t%.2f\n" % ratio)
151
152     outfile.close()
153
154
155 if __name__ == "__main__":
156     main(sys.argv)