10 from commoncode import getConfigParser, getConfigBoolOption, getConfigFloatOption
12 print "normalizeFinalExonic: version 3.6"
18 usage = "usage: python %prog rdsfile expandedRPKMfile multicountfile outfile [--multifraction] [--multifold] [--minrpkm minThreshold] [--cache] [--withGID]"
20 parser = makeParser(usage)
21 (options, args) = parser.parse_args(argv[1:])
28 expandedRPKMfile = args[3]
29 multicountfile = args[2]
32 normalizeFinalExonic(rdsfilename, expandedRPKMfile, multicountfile, outfilename,
33 options.reportFraction, options.reportFold, options.minThreshold,
34 options.doCache, options.writeGID)
37 def makeParser(usage=""):
38 parser = optparse.OptionParser(usage=usage)
39 parser.add_option("--multifraction", action="store_true", dest="reportfraction")
40 parser.add_option("--multifold", action="store_true", dest="reportFold")
41 parser.add_option("--minrpkm", type="float", dest="minThreshold")
42 parser.add_option("--cache", action="store_true", dest="doCache")
43 parser.add_option("--withGID", action="store_true", dest="writeGID")
45 configParser = getConfigParser()
46 section = "normalizeFinalExonic"
47 reportFraction = getConfigBoolOption(configParser, section, "multifraction", False)
48 reportFold = getConfigBoolOption(configParser, section, "reportFold", False)
49 minThreshold = getConfigFloatOption(configParser, section, "minThreshold", 0.)
50 doCache = getConfigBoolOption(configParser, section, "doCache", False)
51 writeGID = getConfigBoolOption(configParser, section, "writeGID", False)
53 parser.set_defaults(reportFraction=reportFraction, reportFold=reportFold, minThreshold=minThreshold,
54 doCache=doCache, writeGID=writeGID)
59 def normalizeFinalExonic(rdsfilename, expandedRPKMfilename, multicountfilename, outfilename,
60 reportFraction=False, reportFold=False, minThreshold=0., doCache=False,
63 expandedRPKMfile = open(expandedRPKMfilename)
64 multicountfile = open(multicountfilename)
67 print "reporting fractional contribution of multireads"
70 print "reporting fold contribution of multireads"
72 RDS = ReadDataset.ReadDataset(rdsfilename, verbose=True, cache=doCache, reportCount=False)
73 uniqcount = RDS.getUniqsCount()
74 splicecount = RDS.getSplicesCount()
75 multicount = RDS.getMultiCount()
81 uniqspliceCount = (uniqcount + splicecount) / 1000000.
82 totalCount = (uniqcount + splicecount + multicount) / 1000000.
86 for line in expandedRPKMfile:
87 fields = line.strip().split()
89 symbolDict[lineGID] = fields[1]
90 countDict[lineGID] = float(fields[-1]) * float(fields[-2]) * uniqspliceCount
91 lengthDict[lineGID] = float(fields[-2])
92 multicountDict[lineGID] = 0
93 if lineGID not in gidList:
94 gidList.append(lineGID)
96 expandedRPKMfile.close()
98 for line in multicountfile:
99 fields = line.strip().split()
102 countDict[gid] += float(fields[-1])
103 multicountDict[gid] = float(fields[-1])
105 print "could not find gid %s in dictionaries" % gid
107 multicountfile.close()
109 outfile = open(outfilename, "w")
114 outheader += "gene\tlen_kb\tRPKM"
116 outheader += "\tmulti/all"
118 outheader += "\tall/uniq"
121 outfile.write(outheader)
127 gene = symbolDict[gid]
128 rpm = countDict[gid] / totalCount
129 rpkm = rpm / lengthDict[gid]
130 if rpkm < minThreshold:
134 outline = "%s\t" % gid
138 multirpm = multicountDict[gid] / totalCount
139 multirpkm = multirpm / lengthDict[gid]
141 print "problem with %s - skipping " % gid
144 if reportFraction or reportFold:
147 multivalue = multirpkm / rpkm
150 uniqrpkm = (rpm - multirpm) / lengthDict[gid]
151 multivalue = rpkm / uniqrpkm
159 outline += "%s\t%.3f\t%.2f\t%.2f\n" % (gene, lengthDict[gid], rpkm, multivalue)
160 outlineList.append((rpkm, outline))
162 outline += "%s\t%.3f\t%.2f\n" % (gene, lengthDict[gid], rpkm)
163 outlineList.append((rpkm, outline))
166 outlineList.reverse()
168 for (rpkm, line) in outlineList:
173 print "returned %d genes" % index
176 if __name__ == "__main__":