first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / geneStallingBins.py
1 #
2 #
3 #  geneStallingBins.py
4 #  ENRAGE
5 #
6
7 # originally from geneLocusBins.py
8 try:
9     import psyco
10     psyco.full()
11 except:
12     pass
13
14 import sys
15 import optparse
16 from commoncode import getMergedRegions, computeRegionBins, getLocusByChromDict, getConfigParser, getConfigBoolOption, getConfigIntOption, getConfigOption
17 import ReadDataset
18 from cistematic.genomes import Genome
19 from commoncode import getGeneInfoDict
20
21 print "geneStallingBins: version 1.4"
22
23
24 def main(argv=None):
25     if not argv:
26         argv = sys.argv
27
28     usage = "usage: python %s genome rdsfile controlrdsfile outfilename [--upstream bp] [--downstream bp] [--regions acceptfile] [--cache] [--normalize] [--tagCount]"
29
30     parser = getParser(usage)
31     (options, args) = parser.parse_args(argv[1:])
32
33     if len(args) < 4:
34         print usage
35         sys.exit(1)
36
37     genome = args[0]
38     hitfile =  args[1]
39     controlfile = args[2]
40     outfilename = args[3]
41
42     geneStallingBins(genome, hitfile, controlfile, outfilename, options.upstreamBp,
43                      options.downstreamBp, options.acceptfile, options.doCache,
44                      options.normalize, options.doTagCount, options.bins)
45
46
47 def getParser(usage):
48     parser = optparse.OptionParser(usage=usage)
49     parser.add_option("--upstream", type="int", dest="upstreamBp")
50     parser.add_option("--downstream", type="int", dest="downstreamBp")
51     parser.add_option("--regions", dest="acceptfile")
52     parser.add_option("--cache", action="store_true", dest="doCache")
53     parser.add_option("--normalize", action="store_true", dest="normalize")
54     parser.add_option("--tagCount", action="store_true", dest="doTagCount")
55     parser.add_option("--bins", type="int", dest="bins")
56
57     configParser = getConfigParser()
58     section = "geneStallingBins"
59     upstreamBp = getConfigIntOption(configParser, section, "upstreamBp", 300)
60     downstreamBp = getConfigIntOption(configParser, section, "downstreamBp", 0)
61     acceptfile = getConfigOption(configParser, section, "acceptfile", "")
62     doCache = getConfigBoolOption(configParser, section, "doCache", False)
63     normalize = getConfigBoolOption(configParser, section, "normalize", False)
64     doTagCount = getConfigBoolOption(configParser, section, "doTagCount", False)
65     bins = getConfigIntOption(configParser, section, "bins", 4)
66
67     parser.set_defaults(upstreamBp=upstreamBp, downstreamBp=downstreamBp, acceptfile=acceptfile,
68                         doCache=doCache, normalize=normalize, doTagCount=doTagCount, bins=bins)
69
70     return parser
71
72
73 def geneStallingBins(genome, hitfile, controlfile, outfilename, upstreamBp=300,
74                      downstreamBp=0, acceptfile="", doCache=False, normalize=False,
75                      doTagCount=False, bins=4):
76
77     acceptDict = {}
78     if acceptfile:
79         acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
80
81     doCDS = True
82     limitNeighbor = False
83
84     hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
85     readlen = hitRDS.getReadSize()
86     hitNormalizationFactor = 1.0
87     if normalize:
88         hitDictSize = len(hitRDS)
89         hitNormalizationFactor = hitDictSize / 1000000.
90
91     controlRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
92     controlNormalizationFactor = 1.0
93     if normalize:
94         controlDictSize = len(hitRDS)
95         controlNormalizationFactor = controlDictSize / 1000000.
96
97     hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
98     controlDict = controlRDS.getReadsDict(doMulti=True, findallOptimize=True)
99
100     hg = Genome(genome)
101     geneinfoDict = getGeneInfoDict(genome, cache=doCache)
102     locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor=limitNeighbor)
103
104     gidList = hg.allGIDs()
105     gidList.sort()
106     for chrom in acceptDict:
107         for region in acceptDict[chrom]:
108             if region.label not in gidList:
109                 gidList.append(region.label)
110
111     (gidBins, gidLen) = computeRegionBins(locusByChromDict, hitDict, bins, readlen, gidList, hitNormalizationFactor, defaultRegionFormat=False, binLength=upstreamBp)
112     (controlBins, gidLen) = computeRegionBins(locusByChromDict, controlDict, bins, readlen, gidList, controlNormalizationFactor, defaultRegionFormat=False, binLength=upstreamBp)
113
114     outfile = open(outfilename, "w")
115
116     for gid in gidList:
117         if "FAR" not in gid:
118             symbol = "LOC" + gid
119             geneinfo = ""
120             try:
121                 geneinfo = geneinfoDict[gid]
122                 symbol = geneinfo[0][0]
123             except:
124                 pass
125         else:
126             symbol = gid
127
128         if gid in gidBins and gid in gidLen:
129             tagCount = 0.
130             controlCount = 0.
131             for binAmount in gidBins[gid]:
132                 tagCount += binAmount
133
134             for binAmount in controlBins[gid]:
135                 controlCount += abs(binAmount)
136
137             diffCount = tagCount + controlCount
138             if diffCount < 0:
139                 diffCount = 0
140
141             outfile.write("%s\t%s\t%.1f\t%d" % (gid, symbol, diffCount, gidLen[gid]))
142             if (gidLen[gid] - 3 * upstreamBp) < upstreamBp:
143                 outfile.write("\tshort\n")
144                 continue
145
146             TSSbins = (tagCount * (gidBins[gid][0] + gidBins[gid][1]) + controlCount * (controlBins[gid][0] + controlBins[gid][1])) / (upstreamBp / 50.)
147             finalbin = (tagCount * gidBins[gid][-1] + controlCount * controlBins[gid][-1]) / ((gidLen[gid] - 3. * upstreamBp) / 100.)
148             if finalbin <= 0.:
149                 finalbin = 0.01
150
151             if TSSbins < 0:
152                 TSSbins = 0
153
154             ratio =  float(TSSbins)/float(finalbin)
155             for binAmount in gidBins[gid]:
156                 if doTagCount:
157                     binAmount = binAmount * tagCount / 100.
158
159                 if normalize:
160                     if tagCount == 0:
161                         tagCount = 1
162
163                     outfile.write("\t%.1f" % (100. * binAmount / tagCount))
164                 else:
165                     outfile.write("\t%.1f" % binAmount)
166
167         outfile.write("\t%.2f\n" % ratio)
168
169     outfile.close()
170
171
172 if __name__ == "__main__":
173     main(sys.argv)