development checkpoint
[erange.git] / normalizeFinalExonic.py
1 try:
2     import psyco
3     psyco.full()
4 except:
5     pass
6
7 import sys
8 import optparse
9 import ReadDataset
10 from commoncode import getConfigParser, getConfigBoolOption, getConfigFloatOption
11
12 print "normalizeFinalExonic: version 3.6"
13
14 def main(argv=None):
15     if not argv:
16         argv = sys.argv
17
18     usage = "usage: python %prog rdsfile expandedRPKMfile multicountfile outfile [--multifraction] [--multifold] [--minrpkm minThreshold] [--cache] [--withGID]"
19
20     parser = makeParser(usage)
21     (options, args) = parser.parse_args(argv[1:])
22
23     if len(args) < 4:
24         print usage
25         sys.exit(1)
26
27     rdsfilename = args[0]
28     expandedRPKMfile = args[1]
29     multicountfile = args[2]
30     outfilename = args[3]
31
32     normalizeFinalExonic(rdsfilename, expandedRPKMfile, multicountfile, outfilename,
33                          options.reportFraction, options.reportFold, options.minThreshold,
34                          options.doCache, options.writeGID)
35
36
37 def makeParser(usage=""):
38     parser = optparse.OptionParser(usage=usage)
39     parser.add_option("--multifraction", action="store_true", dest="reportfraction")
40     parser.add_option("--multifold", action="store_true", dest="reportFold")
41     parser.add_option("--minrpkm", type="float", dest="minThreshold")
42     parser.add_option("--cache", action="store_true", dest="doCache")
43     parser.add_option("--withGID", action="store_true", dest="writeGID")
44
45     configParser = getConfigParser()
46     section = "normalizeFinalExonic"
47     reportFraction = getConfigBoolOption(configParser, section, "multifraction", False)
48     reportFold = getConfigBoolOption(configParser, section, "reportFold", False)
49     minThreshold = getConfigFloatOption(configParser, section, "minThreshold", 0.)
50     doCache = getConfigBoolOption(configParser, section, "doCache", False)
51     writeGID = getConfigBoolOption(configParser, section, "writeGID", False)
52
53     parser.set_defaults(reportFraction=reportFraction, reportFold=reportFold, minThreshold=minThreshold,
54                         doCache=doCache, writeGID=writeGID)
55
56     return parser
57
58
59 def normalizeFinalExonic(rdsfilename, expandedRPKMfilename, multicountfilename, outfilename,
60                          reportFraction=False, reportFold=False, minThreshold=0., doCache=False,
61                          writeGID=False):
62
63     expandedRPKMfile = open(expandedRPKMfilename)
64     multicountfile = open(multicountfilename)
65
66     if reportFraction:
67         print "reporting fractional contribution of multireads"
68         reportFold = False
69     elif reportFold:
70         print "reporting fold contribution of multireads"
71
72     RDS = ReadDataset.ReadDataset(rdsfilename, verbose=True, cache=doCache, reportCount=False)
73     uniqcount = RDS.getUniqsCount()
74     splicecount = RDS.getSplicesCount()
75     multicount = RDS.getMultiCount()
76     countDict = {}
77     multicountDict = {}
78     lengthDict = {}
79     gidList = []
80
81     uniqspliceCount = (uniqcount + splicecount) / 1000000.
82     totalCount = (uniqcount + splicecount + multicount) / 1000000.
83
84     symbolDict = {}
85
86     for line in expandedRPKMfile:
87         fields = line.strip().split()
88         lineGID = fields[0]
89         symbolDict[lineGID] = fields[1]
90         countDict[lineGID] = float(fields[-1]) * float(fields[-2]) * uniqspliceCount
91         lengthDict[lineGID] = float(fields[-2])
92         multicountDict[lineGID] = 0
93         if lineGID not in gidList:
94             gidList.append(lineGID)
95
96     expandedRPKMfile.close()
97
98     for line in multicountfile:
99         fields = line.strip().split()
100         gid = fields[0]
101         if gid in countDict:
102             countDict[gid] += float(fields[-1])
103             multicountDict[gid] = float(fields[-1])
104         else:
105             print "could not find gid %s in dictionaries" % gid
106
107     multicountfile.close()
108
109     outfile = open(outfilename, "w")
110     outheader = "#"
111     if writeGID:
112         outheader += "GID\t"
113
114     outheader += "gene\tlen_kb\tRPKM"
115     if reportFraction:
116         outheader += "\tmulti/all"
117     elif reportFold:
118         outheader += "\tall/uniq"
119         
120     outheader += "\n"
121     outfile.write(outheader)
122
123     outlineList = []
124     index = 0
125     for gid in gidList:
126         outline = ""
127         gene = symbolDict[gid]
128         rpm = countDict[gid] / totalCount
129         rpkm = rpm / lengthDict[gid]
130         if rpkm < minThreshold:
131             continue
132
133         if writeGID:
134             outline = "%s\t" % gid
135
136         index += 1
137         try:
138             multirpm = multicountDict[gid] / totalCount
139             multirpkm = multirpm / lengthDict[gid]
140         except:
141             print "problem with %s - skipping " % gid
142             continue
143
144         if reportFraction or reportFold:
145             try:
146                 if reportFraction:
147                     multivalue = multirpkm / rpkm
148                 else:
149                     if rpm > multirpm:
150                         uniqrpkm = (rpm - multirpm) / lengthDict[gid]
151                         multivalue = rpkm / uniqrpkm
152                     elif rpkm > 0.01:
153                         multivalue = 100.
154                     else:
155                         multivalue = 1.0
156             except:
157                 multivalue = 0
158
159             outline += "%s\t%.3f\t%.2f\t%.2f\n" %  (gene, lengthDict[gid], rpkm, multivalue)
160             outlineList.append((rpkm, outline))
161         else:
162             outline += "%s\t%.3f\t%.2f\n" %  (gene, lengthDict[gid], rpkm)
163             outlineList.append((rpkm, outline))
164
165     outlineList.sort()
166     outlineList.reverse()
167
168     for (rpkm, line) in outlineList:
169         outfile.write(line)
170
171     outfile.close()
172
173     print "returned %d genes" % index
174
175
176 if __name__ == "__main__":
177     main(sys.argv)