first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / normalizeFinalExonic.py
1 try:
2     import psyco
3     psyco.full()
4 except:
5     pass
6
7 import sys
8 import optparse
9 import ReadDataset
10 from commoncode import getConfigParser, getConfigBoolOption, getConfigFloatOption
11
12 print "normalizeFinalExonic: version 3.6"
13
14 def main(argv=None):
15     if not argv:
16         argv = sys.argv
17
18     usage = "usage: python normalizeFinalExonic rdsfile expandedRPKMfile multicountfile outfile [--multifraction] [--multifold] [--minrpkm minThreshold] [--cache] [--withGID]"
19
20     parser = makeParser(usage)
21     (options, args) = parser.parse_args(argv[1:])
22
23     if len(args) < 4:
24         print usage
25         sys.exit(1)
26
27     rdsfilename = args[0]
28     expandedRPKMfile = args[1]
29     multicountfile = args[2]
30     outfilename = args[3]
31
32     readCounts = {}
33     RDS = ReadDataset.ReadDataset(rdsfilename, verbose=True, cache=options.doCache, reportCount=False)
34     readCounts["uniq"] = RDS.getUniqsCount()
35     readCounts["splice"] = RDS.getSplicesCount()
36     readCounts["multi"] = RDS.getMultiCount()
37
38     normalizeFinalExonic(readCounts, expandedRPKMfile, multicountfile, outfilename,
39                          options.reportFraction, options.reportFold, options.minThreshold,
40                          options.doCache, options.writeGID)
41
42
43 def makeParser(usage=""):
44     parser = optparse.OptionParser(usage=usage)
45     parser.add_option("--multifraction", action="store_true", dest="reportfraction")
46     parser.add_option("--multifold", action="store_true", dest="reportFold")
47     parser.add_option("--minrpkm", type="float", dest="minThreshold")
48     parser.add_option("--cache", action="store_true", dest="doCache")
49     parser.add_option("--withGID", action="store_true", dest="writeGID")
50
51     configParser = getConfigParser()
52     section = "normalizeFinalExonic"
53     reportFraction = getConfigBoolOption(configParser, section, "multifraction", False)
54     reportFold = getConfigBoolOption(configParser, section, "reportFold", False)
55     minThreshold = getConfigFloatOption(configParser, section, "minThreshold", 0.)
56     doCache = getConfigBoolOption(configParser, section, "doCache", False)
57     writeGID = getConfigBoolOption(configParser, section, "writeGID", False)
58
59     parser.set_defaults(reportFraction=reportFraction, reportFold=reportFold, minThreshold=minThreshold,
60                         doCache=doCache, writeGID=writeGID)
61
62     return parser
63
64
65 def normalizeFinalExonic(readCounts, expandedRPKMfilename, multicountfilename, outfilename,
66                          reportFraction=False, reportFold=False, minThreshold=0., doCache=False,
67                          writeGID=False):
68
69     expandedRPKMfile = open(expandedRPKMfilename)
70     multicountfile = open(multicountfilename)
71
72     if reportFraction:
73         print "reporting fractional contribution of multireads"
74         reportFold = False
75     elif reportFold:
76         print "reporting fold contribution of multireads"
77
78     uniqcount = readCounts["uniq"]
79     splicecount = readCounts["splice"]
80     multicount = readCounts["multi"]
81     countDict = {}
82     multicountDict = {}
83     lengthDict = {}
84     gidList = []
85
86     uniqspliceCount = (uniqcount + splicecount) / 1000000.
87     totalCount = (uniqcount + splicecount + multicount) / 1000000.
88
89     symbolDict = {}
90
91     for line in expandedRPKMfile:
92         fields = line.strip().split()
93         lineGID = fields[0]
94         symbolDict[lineGID] = fields[1]
95         countDict[lineGID] = float(fields[-1]) * float(fields[-2]) * uniqspliceCount
96         lengthDict[lineGID] = float(fields[-2])
97         multicountDict[lineGID] = 0
98         if lineGID not in gidList:
99             gidList.append(lineGID)
100
101     expandedRPKMfile.close()
102
103     for line in multicountfile:
104         fields = line.strip().split()
105         gid = fields[0]
106         if gid in countDict:
107             countDict[gid] += float(fields[-1])
108             multicountDict[gid] = float(fields[-1])
109         else:
110             print "could not find gid %s in dictionaries" % gid
111
112     multicountfile.close()
113
114     outfile = open(outfilename, "w")
115     outheader = "#"
116     if writeGID:
117         outheader += "GID\t"
118
119     outheader += "gene\tlen_kb\tRPKM"
120     if reportFraction:
121         outheader += "\tmulti/all"
122     elif reportFold:
123         outheader += "\tall/uniq"
124         
125     outheader += "\n"
126     outfile.write(outheader)
127
128     outlineList = []
129     index = 0
130     for gid in gidList:
131         outline = ""
132         gene = symbolDict[gid]
133         rpm = countDict[gid] / totalCount
134         rpkm = rpm / lengthDict[gid]
135         if rpkm < minThreshold:
136             continue
137
138         if writeGID:
139             outline = "%s\t" % gid
140
141         index += 1
142         try:
143             multirpm = multicountDict[gid] / totalCount
144             multirpkm = multirpm / lengthDict[gid]
145         except:
146             print "problem with %s - skipping " % gid
147             continue
148
149         if reportFraction or reportFold:
150             try:
151                 if reportFraction:
152                     multivalue = multirpkm / rpkm
153                 else:
154                     if rpm > multirpm:
155                         uniqrpkm = (rpm - multirpm) / lengthDict[gid]
156                         multivalue = rpkm / uniqrpkm
157                     elif rpkm > 0.01:
158                         multivalue = 100.
159                     else:
160                         multivalue = 1.0
161             except:
162                 multivalue = 0
163
164             outline += "%s\t%.3f\t%.2f\t%.2f\n" %  (gene, lengthDict[gid], rpkm, multivalue)
165             outlineList.append((rpkm, outline))
166         else:
167             outline += "%s\t%.3f\t%.2f\n" %  (gene, lengthDict[gid], rpkm)
168             outlineList.append((rpkm, outline))
169
170     outlineList.sort()
171     outlineList.reverse()
172
173     for (rpkm, line) in outlineList:
174         outfile.write(line)
175
176     outfile.close()
177
178     print "returned %d genes" % index
179
180
181 if __name__ == "__main__":
182     main(sys.argv)