snapshot of 4.0a development. initial git repo commit
[erange.git] / normalizeFinalExonic.py
1 try:
2     import psyco
3     psyco.full()
4 except:
5     pass
6
7 import sys, optparse
8 from commoncode import readDataset
9
10 print "%prog: version 3.5" % sys.argv[0]
11
12 def main(argv=None):
13     if not argv:
14         argv = sys.argv
15
16     usage = "usage: python %prog rdsfile expandedRPKMfile multicountfile outfile [--multifraction] [--multifold] [--minrpkm minThreshold] [--cache] [--withGID]"
17
18     parser = optparse.OptionParser(usage=usage)
19     parser.add_option("--multifraction", action="store_true", dest="reportfraction")
20     parser.add_option("--multifold", action="store_true", dest="reportFold")
21     parser.add_option("--minrpkm", type="float", dest="minThreshold")
22     parser.add_option("--cache", action="store_true", dest="doCache")
23     parser.add_option("--withGID", action="store_true", dest="writeGID")
24     parser.set_defaults(reportFraction=False, reportFold=False, minThreshold=0.,
25                         doCache=False, writeGID=False)
26
27     (options, args) = parser.parse_args(argv[1:])
28
29     if len(args) < 4:
30         print usage
31         sys.exit(1)
32
33     rdsfilename = argv[1]
34     expandedRPKMfile = args[3]
35     multicountfile = args[2]
36     outfilename = args[3]
37
38     normalizeFinalExonic(rdsfilename, expandedRPKMfile, multicountfile, outfilename,
39                          options.reportFraction, options.reportFold, options.minThreshold,
40                          options.doCache, options.writeGID)
41
42
43 def normalizeFinalExonic(rdsfilename, expandedRPKMfilename, multicountfilename, outfilename,
44                          reportFraction=False, reportFold=False, minThreshold=0., doCache=False,
45                          writeGID=False):
46
47     expandedRPKMfile = open(expandedRPKMfilename)
48     multicountfile = open(multicountfilename)
49
50     if reportFraction:
51         print "reporting fractional contribution of multireads"
52         reportFold = False
53     elif reportFold:
54         print "reporting fold contribution of multireads"
55
56     RDS = readDataset(rdsfilename, verbose=True, cache=doCache, reportCount=False)
57     uniqcount = RDS.getUniqsCount()
58     splicecount = RDS.getSplicesCount()
59     multicount = RDS.getMultiCount()
60     countDict = {}
61     multicountDict = {}
62     lengthDict = {}
63     gidList = []
64
65     uniqspliceCount = (uniqcount + splicecount) / 1000000.
66     totalCount = (uniqcount + splicecount + multicount) / 1000000.
67
68     symbolDict = {}
69
70     for line in expandedRPKMfile:
71         fields = line.strip().split()
72         lineGID = fields[0]
73         symbolDict[lineGID] = fields[1]
74         countDict[lineGID] = float(fields[-1]) * float(fields[-2]) * uniqspliceCount
75         lengthDict[lineGID] = float(fields[-2])
76         multicountDict[lineGID] = 0
77         if lineGID not in gidList:
78             gidList.append(lineGID)
79
80     expandedRPKMfile.close()
81
82     for line in multicountfile:
83         fields = line.strip().split()
84         gid = fields[0]
85         if gid in countDict:
86             countDict[gid] += float(fields[-1])
87             multicountDict[gid] = float(fields[-1])
88         else:
89             print "could not find gid %s in dictionaries" % gid
90
91     multicountfile.close()
92
93     outfile = open(outfilename, "w")
94     outheader = "#"
95     if writeGID:
96         outheader += "GID\t"
97
98     outheader += "gene\tlen_kb\tRPKM"
99     if reportFraction:
100         outheader += "\tmulti/all"
101     elif reportFold:
102         outheader += "\tall/uniq"
103         
104     outheader += "\n"
105     outfile.write(outheader)
106
107     outlineList = []
108     index = 0
109     for gid in gidList:
110         outline = ""
111         gene = symbolDict[gid]
112         rpm = countDict[gid] / totalCount
113         rpkm = rpm / lengthDict[gid]
114         if rpkm < minThreshold:
115             continue
116
117         if writeGID:
118             outline = "%s\t" % gid
119
120         index += 1
121         try:
122             multirpm = multicountDict[gid] / totalCount
123             multirpkm = multirpm / lengthDict[gid]
124         except:
125             print "problem with %s - skipping " % gid
126             continue
127
128         if reportFraction or reportFold:
129             try:
130                 if reportFraction:
131                     multivalue = multirpkm / rpkm
132                 else:
133                     if rpm > multirpm:
134                         uniqrpkm = (rpm - multirpm) / lengthDict[gid]
135                         multivalue = rpkm / uniqrpkm
136                     elif rpkm > 0.01:
137                         multivalue = 100.
138                     else:
139                         multivalue = 1.0
140             except:
141                 multivalue = 0
142
143             outline += "%s\t%.3f\t%.2f\t%.2f\n" %  (gene, lengthDict[gid], rpkm, multivalue)
144             outlineList.append((rpkm, outline))
145         else:
146             outline += "%s\t%.3f\t%.2f\n" %  (gene, lengthDict[gid], rpkm)
147             outlineList.append((rpkm, outline))
148
149     outlineList.sort()
150     outlineList.reverse()
151
152     for (rpkm, line) in outlineList:
153         outfile.write(line)
154
155     outfile.close()
156
157     print "returned %d genes" % index
158
159
160 if __name__ == "__main__":
161     main(sys.argv)