8 from commoncode import readDataset
10 print "%prog: version 3.5" % sys.argv[0]
16 usage = "usage: python %prog rdsfile expandedRPKMfile multicountfile outfile [--multifraction] [--multifold] [--minrpkm minThreshold] [--cache] [--withGID]"
18 parser = optparse.OptionParser(usage=usage)
19 parser.add_option("--multifraction", action="store_true", dest="reportfraction")
20 parser.add_option("--multifold", action="store_true", dest="reportFold")
21 parser.add_option("--minrpkm", type="float", dest="minThreshold")
22 parser.add_option("--cache", action="store_true", dest="doCache")
23 parser.add_option("--withGID", action="store_true", dest="writeGID")
24 parser.set_defaults(reportFraction=False, reportFold=False, minThreshold=0.,
25 doCache=False, writeGID=False)
27 (options, args) = parser.parse_args(argv[1:])
34 expandedRPKMfile = args[3]
35 multicountfile = args[2]
38 normalizeFinalExonic(rdsfilename, expandedRPKMfile, multicountfile, outfilename,
39 options.reportFraction, options.reportFold, options.minThreshold,
40 options.doCache, options.writeGID)
43 def normalizeFinalExonic(rdsfilename, expandedRPKMfilename, multicountfilename, outfilename,
44 reportFraction=False, reportFold=False, minThreshold=0., doCache=False,
47 expandedRPKMfile = open(expandedRPKMfilename)
48 multicountfile = open(multicountfilename)
51 print "reporting fractional contribution of multireads"
54 print "reporting fold contribution of multireads"
56 RDS = readDataset(rdsfilename, verbose=True, cache=doCache, reportCount=False)
57 uniqcount = RDS.getUniqsCount()
58 splicecount = RDS.getSplicesCount()
59 multicount = RDS.getMultiCount()
65 uniqspliceCount = (uniqcount + splicecount) / 1000000.
66 totalCount = (uniqcount + splicecount + multicount) / 1000000.
70 for line in expandedRPKMfile:
71 fields = line.strip().split()
73 symbolDict[lineGID] = fields[1]
74 countDict[lineGID] = float(fields[-1]) * float(fields[-2]) * uniqspliceCount
75 lengthDict[lineGID] = float(fields[-2])
76 multicountDict[lineGID] = 0
77 if lineGID not in gidList:
78 gidList.append(lineGID)
80 expandedRPKMfile.close()
82 for line in multicountfile:
83 fields = line.strip().split()
86 countDict[gid] += float(fields[-1])
87 multicountDict[gid] = float(fields[-1])
89 print "could not find gid %s in dictionaries" % gid
91 multicountfile.close()
93 outfile = open(outfilename, "w")
98 outheader += "gene\tlen_kb\tRPKM"
100 outheader += "\tmulti/all"
102 outheader += "\tall/uniq"
105 outfile.write(outheader)
111 gene = symbolDict[gid]
112 rpm = countDict[gid] / totalCount
113 rpkm = rpm / lengthDict[gid]
114 if rpkm < minThreshold:
118 outline = "%s\t" % gid
122 multirpm = multicountDict[gid] / totalCount
123 multirpkm = multirpm / lengthDict[gid]
125 print "problem with %s - skipping " % gid
128 if reportFraction or reportFold:
131 multivalue = multirpkm / rpkm
134 uniqrpkm = (rpm - multirpm) / lengthDict[gid]
135 multivalue = rpkm / uniqrpkm
143 outline += "%s\t%.3f\t%.2f\t%.2f\n" % (gene, lengthDict[gid], rpkm, multivalue)
144 outlineList.append((rpkm, outline))
146 outline += "%s\t%.3f\t%.2f\n" % (gene, lengthDict[gid], rpkm)
147 outlineList.append((rpkm, outline))
150 outlineList.reverse()
152 for (rpkm, line) in outlineList:
157 print "returned %d genes" % index
160 if __name__ == "__main__":