10 from commoncode import getConfigParser, getConfigBoolOption, getConfigFloatOption
12 print "normalizeFinalExonic: version 3.6"
18 usage = "usage: python normalizeFinalExonic rdsfile expandedRPKMfile multicountfile outfile [--multifraction] [--multifold] [--minrpkm minThreshold] [--cache] [--withGID]"
20 parser = makeParser(usage)
21 (options, args) = parser.parse_args(argv[1:])
28 expandedRPKMfile = args[1]
29 multicountfile = args[2]
33 RDS = ReadDataset.ReadDataset(rdsfilename, verbose=True, cache=options.doCache, reportCount=False)
34 readCounts["uniq"] = RDS.getUniqsCount()
35 readCounts["splice"] = RDS.getSplicesCount()
36 readCounts["multi"] = RDS.getMultiCount()
38 normalizeFinalExonic(readCounts, expandedRPKMfile, multicountfile, outfilename,
39 options.reportFraction, options.reportFold, options.minThreshold,
40 options.doCache, options.writeGID)
43 def makeParser(usage=""):
44 parser = optparse.OptionParser(usage=usage)
45 parser.add_option("--multifraction", action="store_true", dest="reportfraction")
46 parser.add_option("--multifold", action="store_true", dest="reportFold")
47 parser.add_option("--minrpkm", type="float", dest="minThreshold")
48 parser.add_option("--cache", action="store_true", dest="doCache")
49 parser.add_option("--withGID", action="store_true", dest="writeGID")
51 configParser = getConfigParser()
52 section = "normalizeFinalExonic"
53 reportFraction = getConfigBoolOption(configParser, section, "multifraction", False)
54 reportFold = getConfigBoolOption(configParser, section, "reportFold", False)
55 minThreshold = getConfigFloatOption(configParser, section, "minThreshold", 0.)
56 doCache = getConfigBoolOption(configParser, section, "doCache", False)
57 writeGID = getConfigBoolOption(configParser, section, "writeGID", False)
59 parser.set_defaults(reportFraction=reportFraction, reportFold=reportFold, minThreshold=minThreshold,
60 doCache=doCache, writeGID=writeGID)
65 def normalizeFinalExonic(readCounts, expandedRPKMfilename, multicountfilename, outfilename,
66 reportFraction=False, reportFold=False, minThreshold=0., doCache=False,
69 expandedRPKMfile = open(expandedRPKMfilename)
70 multicountfile = open(multicountfilename)
73 print "reporting fractional contribution of multireads"
76 print "reporting fold contribution of multireads"
78 uniqcount = readCounts["uniq"]
79 splicecount = readCounts["splice"]
80 multicount = readCounts["multi"]
86 uniqspliceCount = (uniqcount + splicecount) / 1000000.
87 totalCount = (uniqcount + splicecount + multicount) / 1000000.
91 for line in expandedRPKMfile:
92 fields = line.strip().split()
94 symbolDict[lineGID] = fields[1]
95 countDict[lineGID] = float(fields[-1]) * float(fields[-2]) * uniqspliceCount
96 lengthDict[lineGID] = float(fields[-2])
97 multicountDict[lineGID] = 0
98 if lineGID not in gidList:
99 gidList.append(lineGID)
101 expandedRPKMfile.close()
103 for line in multicountfile:
104 fields = line.strip().split()
107 countDict[gid] += float(fields[-1])
108 multicountDict[gid] = float(fields[-1])
110 print "could not find gid %s in dictionaries" % gid
112 multicountfile.close()
114 outfile = open(outfilename, "w")
119 outheader += "gene\tlen_kb\tRPKM"
121 outheader += "\tmulti/all"
123 outheader += "\tall/uniq"
126 outfile.write(outheader)
132 gene = symbolDict[gid]
133 rpm = countDict[gid] / totalCount
134 rpkm = rpm / lengthDict[gid]
135 if rpkm < minThreshold:
139 outline = "%s\t" % gid
143 multirpm = multicountDict[gid] / totalCount
144 multirpkm = multirpm / lengthDict[gid]
146 print "problem with %s - skipping " % gid
149 if reportFraction or reportFold:
152 multivalue = multirpkm / rpkm
155 uniqrpkm = (rpm - multirpm) / lengthDict[gid]
156 multivalue = rpkm / uniqrpkm
164 outline += "%s\t%.3f\t%.2f\t%.2f\n" % (gene, lengthDict[gid], rpkm, multivalue)
165 outlineList.append((rpkm, outline))
167 outline += "%s\t%.3f\t%.2f\n" % (gene, lengthDict[gid], rpkm)
168 outlineList.append((rpkm, outline))
171 outlineList.reverse()
173 for (rpkm, line) in outlineList:
178 print "returned %d genes" % index
181 if __name__ == "__main__":